Commit 492865a3 authored by yanyan's avatar yanyan
Browse files

faster subm indice generation by

unroll loop for common params
parent 9a23d934
...@@ -59,7 +59,7 @@ Since install newest driver and CUDA is very simple on windows, please use CUDA ...@@ -59,7 +59,7 @@ Since install newest driver and CUDA is very simple on windows, please use CUDA
```Python ```Python
features = # your features with shape [N, numPlanes] features = # your features with shape [N, numPlanes]
indices = # your indices/coordinates with shape [N, ndim + 1], batch index must be put in indices[:, 0] indices = # your indices/coordinates with shape [N, ndim + 1], batch index must be put in indices[:, 0]
spatial_shape = # spatial shape of your sparse tensor. spatial_shape = # spatial shape of your sparse tensor, spatial_shape[i] is shape of indices[:, 1 + i].
batch_size = # batch size of your sparse tensor. batch_size = # batch size of your sparse tensor.
x = spconv.SparseConvTensor(features, indices, spatial_shape, batch_size) x = spconv.SparseConvTensor(features, indices, spatial_shape, batch_size)
x_dense_NCHW = x.dense() # convert sparse tensor to dense NCHW tensor. x_dense_NCHW = x.dense() # convert sparse tensor to dense NCHW tensor.
......
#pragma once
#include <cublas_v2.h>
#include <tensorview/tensorview.h>
namespace spconv {
template <class T>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const T *alpha, const T *A, int lda, const T *B,
int ldb, const T *beta, T *C, int ldc);
template <class T>
cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const T *alpha, const T *A, int lda, const T *B,
int ldb, const T *beta, T *C, int ldc) {
return cublasTgemm<T>(handle, transb, transa, n, m, k, alpha, B, ldb, A, lda,
beta, C, ldc);
}
template <class T> inline T constant_scalar(float data) { return T(data); }
template <class T>
cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb,
const tv::TensorView<T> A, const tv::TensorView<T> B,
tv::TensorView<T> C) {
TV_ASSERT_RT_ERR(A.ndim() == 2, "error");
TV_ASSERT_RT_ERR(B.ndim() == 2, "error");
auto transa_cublas = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
auto transb_cublas = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
int m = transa ? A.dim(1) : A.dim(0);
int n = transb ? B.dim(0) : B.dim(1);
int ka = transa ? A.dim(0) : A.dim(1);
int kb = transb ? B.dim(1) : B.dim(0);
int lda = transa ? m : ka;
int ldb = transb ? ka : n;
int ldc = n;
TV_ASSERT_RT_ERR(ka == kb, "error");
T alpha = constant_scalar<T>(1);
T beta = constant_scalar<T>(0);
return cublasTgemmRow<T>(handle, transa_cublas, transb_cublas, m, n, ka,
&alpha, A.data(), lda, B.data(), ldb, &beta,
C.data(), ldc);
}
} // namespace spconv
...@@ -54,9 +54,10 @@ __global__ void prepareIndicePairsKernel( ...@@ -54,9 +54,10 @@ __global__ void prepareIndicePairsKernel(
for (Index i = 0; i < numValidPoints; ++i) { for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1); pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim]; auto offset = pointPtr[NDim];
auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(0, offset, oldNum) = ix; indicePairs(0, offset, oldNum) = ix;
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) + index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0); spatialVolume * indicesIn(ix, 0);
indicePairs(1, offset, oldNum) = index; indicePairs(1, offset, oldNum) = index;
indicePairUnique[offset * indicePairsDim2 + oldNum] = index; indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
...@@ -97,9 +98,10 @@ __global__ void prepareDeConvIndicePairsKernel( ...@@ -97,9 +98,10 @@ __global__ void prepareDeConvIndicePairsKernel(
for (Index i = 0; i < numValidPoints; ++i) { for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1); pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim]; auto offset = pointPtr[NDim];
auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(0, offset, oldNum) = ix; indicePairs(0, offset, oldNum) = ix;
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) + index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0); spatialVolume * indicesIn(ix, 0);
indicePairs(1, offset, oldNum) = index; indicePairs(1, offset, oldNum) = index;
indicePairUnique[offset * indicePairsDim2 + oldNum] = index; indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
...@@ -190,21 +192,16 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut, ...@@ -190,21 +192,16 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
} }
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void __global__ void prepareSubMGridKernel(
prepareSubMGridKernel(tv::TensorView<const Index> indicesIn, tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<IndexGrid> gridsOut, const tv::SimpleVector<Index, NDim> outSpatialShape, Index spatialVolume) {
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0); auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index index = 0; Index index = 0;
for (int ix : tv::KernelLoopX<int>(numActIn)) { for (int ix : tv::KernelLoopX<int>(numActIn)) {
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1, index =
outSpatialShape.data()) + tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
spatialVolume * indicesIn(ix, 0); indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
gridsOut[index] = ix; gridsOut[index] = ix;
} }
} }
...@@ -258,10 +255,11 @@ __global__ void getSubMIndicePairsKernel( ...@@ -258,10 +255,11 @@ __global__ void getSubMIndicePairsKernel(
for (int i = 0; i < numValidPoints; ++i) { for (int i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1); pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim]; auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) + index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0); spatialVolume * indicesIn(ix, 0);
if (gridsOut[index] > -1) { if (gridsOut[index] > -1) {
auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index]; indicePairs(1, offset, oldNum) = gridsOut[index];
indicePairs(0, offset, oldNum) = ix; indicePairs(0, offset, oldNum) = ix;
} }
...@@ -269,6 +267,82 @@ __global__ void getSubMIndicePairsKernel( ...@@ -269,6 +267,82 @@ __global__ void getSubMIndicePairsKernel(
} }
} }
template <typename Index, typename IndexGrid, unsigned K0, unsigned K1,
unsigned K2>
__global__ void getSubMIndicePairsKernel3(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
auto numActIn = indicesIn.dim(0);
Index point[3];
Index index = 0;
Index offset;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (3 + 1);
#pragma unroll
for (int i = 0; i < K0; ++i) {
#pragma unroll
for (int j = 0; j < K1; ++j) {
#pragma unroll
for (int k = 0; k < K2; ++k) {
offset = i * K1 * K2 + j * K2 + k;
point[2] = indice_data[3] - k + K2 / 2;
point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[2] >= 0 &&
point[2] < outSpatialShape[2] && point[0] >= 0 &&
point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
if (gridsOut[index] != -1) {
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index];
indicePairs(0, offset, oldNum) = ix;
}
}
}
}
}
}
}
template <typename Index, typename IndexGrid, unsigned K0, unsigned K1>
__global__ void getSubMIndicePairsKernel2(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume) {
auto numActIn = indicesIn.dim(0);
Index point[2];
Index index = 0;
Index offset;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (2 + 1);
#pragma unroll
for (int i = 0; i < K0; ++i) {
#pragma unroll
for (int j = 0; j < K1; ++j) {
offset = i * K1 + j;
point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
if (gridsOut[index] > -1) {
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index];
indicePairs(0, offset, oldNum) = ix;
}
}
}
}
}
}
template <typename Index, unsigned NDim, int KernelMaxVolume = 256, template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
unsigned kNumHashFunctions = 4> unsigned kNumHashFunctions = 4>
__global__ void getSubMIndicePairsHashKernel( __global__ void getSubMIndicePairsHashKernel(
...@@ -299,12 +373,13 @@ __global__ void getSubMIndicePairsHashKernel( ...@@ -299,12 +373,13 @@ __global__ void getSubMIndicePairsHashKernel(
for (int i = 0; i < numValidPoints; ++i) { for (int i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1); pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim]; auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) + index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0); spatialVolume * indicesIn(ix, 0);
auto val = cuhash::retrieve((unsigned)(index), table_size, table, auto val = cuhash::retrieve((unsigned)(index), table_size, table,
constants, stash_constants, stash_count); constants, stash_constants, stash_count);
if (val != cuhash::kNotFound) { if (val != cuhash::kNotFound) {
auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(1, offset, oldNum) = val; indicePairs(1, offset, oldNum) = val;
indicePairs(0, offset, oldNum) = ix; indicePairs(0, offset, oldNum) = ix;
} }
...@@ -332,7 +407,7 @@ __global__ void ...@@ -332,7 +407,7 @@ __global__ void
resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut, resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape, const tv::SimpleVector<Index, NDim> outSpatialShape,
int numAct) { int numAct) {
int outSpatialShapeReg[NDim]; Index outSpatialShapeReg[NDim];
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
outSpatialShapeReg[i] = outSpatialShape[i]; outSpatialShapeReg[i] = outSpatialShape[i];
} }
...@@ -345,11 +420,14 @@ resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut, ...@@ -345,11 +420,14 @@ resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
Index index; Index index;
for (int ix : tv::KernelLoopX<int>(numAct)) { for (int ix : tv::KernelLoopX<int>(numAct)) {
indsPtr = indices + ix * (NDim + 1); indsPtr = indices + ix * (NDim + 1);
index = tv::rowArrayIdx<Index, NDim>(indsPtr + 1, outSpatialShapeReg); index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(indsPtr + 1,
outSpatialShapeReg, 0);
gridsOut[index + spatialVolume * indsPtr[0]] = -1; gridsOut[index + spatialVolume * indsPtr[0]] = -1;
} }
} }
} // namespace spconv } // namespace spconv
#undef atomicAdd
#endif #endif
\ No newline at end of file
...@@ -18,65 +18,6 @@ ...@@ -18,65 +18,6 @@
#include <torch/script.h> #include <torch/script.h>
namespace spconv { namespace spconv {
namespace functor {
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1 {
Index operator()(const Device &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2 {
Index operator()(const Device &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false, bool useHash = true);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor {
Index operator()(const Device &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false, bool useHash = true);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor {
Index operator()(const Device &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false, bool useHash = true);
};
} // namespace functor
int create_conv_indice_pair_p1_cuda( int create_conv_indice_pair_p1_cuda(
torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize, torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
#define REORDERING_CU_H_ #define REORDERING_CU_H_
#include <THC/THCAtomics.cuh> #include <THC/THCAtomics.cuh>
#include <THC/THCNumerics.cuh> #include <THC/THCNumerics.cuh>
#include <tensorview/kernel_utils.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <tensorview/kernel_utils.h>
#if PYTORCH_VERSION < 10500 #if PYTORCH_VERSION < 10500
#define TH_ATOMIC_ADD atomicAdd #define TH_ATOMIC_ADD atomicAdd
...@@ -299,7 +299,7 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer, ...@@ -299,7 +299,7 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
__hadd2(reinterpret_cast<__half2 *>(buf)[i], __hadd2(reinterpret_cast<__half2 *>(buf)[i],
reinterpret_cast<__half2 *>(buf2)[i]); reinterpret_cast<__half2 *>(buf2)[i]);
} }
#else #else
#pragma unroll #pragma unroll
for (int i = 0; i < vecloadFactor; i++) { for (int i = 0; i < vecloadFactor; i++) {
buf[i] += buf2[i]; buf[i] += buf2[i];
...@@ -319,8 +319,8 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer, ...@@ -319,8 +319,8 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
template <typename T, typename Index, int NumTLP, int NumILP> template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer, __global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size, const Index *indices, int size,
int numPlanes) { int numPlanes) {
int ILPStrideX[NumILP]; int ILPStrideX[NumILP];
#pragma unroll #pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) for (int ilp = 0; ilp < NumILP; ilp++)
...@@ -330,7 +330,8 @@ __global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer, ...@@ -330,7 +330,8 @@ __global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
for (int ix : tv::KernelLoopX<int, NumILP>(size)) { for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll #pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) { for (int ilp = 0; ilp < NumILP; ++ilp) {
outFeatures[indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y] += buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]; outFeatures[indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y] +=
buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
} }
} }
} }
...@@ -338,8 +339,8 @@ __global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer, ...@@ -338,8 +339,8 @@ __global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
template <typename T, typename Index, int NumTLP, int NumILP> template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer, __global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size, const Index *indices, int size,
int numPlanes) { int numPlanes) {
int ILPStrideX[NumILP]; int ILPStrideX[NumILP];
#pragma unroll #pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) for (int ilp = 0; ilp < NumILP; ilp++)
...@@ -351,9 +352,10 @@ __global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer, ...@@ -351,9 +352,10 @@ __global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
#pragma unroll #pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) { for (int ilp = 0; ilp < NumILP; ++ilp) {
idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y; idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
reinterpret_cast<__half2 *>(outFeatures)[idx] = reinterpret_cast<__half2 *>(outFeatures)[idx] = __hadd2(
__hadd2(reinterpret_cast<__half2 *>(outFeatures)[idx], reinterpret_cast<__half2 *>(outFeatures)[idx],
reinterpret_cast<__half2 *>(buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]); reinterpret_cast<__half2 *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
} }
} }
} }
......
...@@ -26,321 +26,15 @@ namespace spconv { ...@@ -26,321 +26,15 @@ namespace spconv {
enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 }; enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
// torch.jit's doc says only support int64, so we need to convert to int32. // torch.jit's doc says only support int64, so we need to convert to int32.
template <unsigned NDim>
std::vector<torch::Tensor> std::vector<torch::Tensor>
getIndicePair(torch::Tensor indices, int64_t batchSize, getIndicePairs(torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> outSpatialShape,
std::vector<int64_t> spatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride, std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation, std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, std::vector<int64_t> outPadding, int64_t _subM,
int64_t _transpose, int64_t _useHash) { int64_t _transpose, int64_t _useHash);
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool transpose = _transpose != 0;
// CPU always use hash (tsl::robin_map).
bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
std::string msg = "due to limits of cuda hash, the volume of dense space "
"include batch size ";
msg += "must less than std::numeric_limits<int>::max() = 2e9";
TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(),
msg);
torch::Tensor indicePairs =
torch::full({2, kernelVolume, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
auto gridSize = batchSize * outputVolume;
if (useHash) {
gridSize = batchSize;
}
torch::Tensor gridOut = torch::full(
{gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
gridOut = gridOut.view({batchSize, -1});
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, false, useHash);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, false, useHash);
if (numActOut == -1) {
// build hash failed. use CPU algorithm
auto device = indices.device();
indicePairs = indicePairs.to({torch::kCPU});
indiceNum = indiceNum.to({torch::kCPU});
indices = indices.to({torch::kCPU});
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, false, useHash);
return {indices.to(device), indicePairs.to(device),
indiceNum.to(device)};
}
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
return {indices, indicePairs, indiceNum};
} else {
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
false, useHash);
if (numActOut == -1) {
// build hash failed. use CPU algorithm
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
auto device = indices.device();
outInds = outInds.to({torch::kCPU});
indicePairs = indicePairs.to({torch::kCPU});
indiceNum = indiceNum.to({torch::kCPU});
indices = indices.to({torch::kCPU});
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose);
return {outInds.to(device).slice(0, 0, numActOut),
indicePairs.to(device), indiceNum.to(device)};
}
}
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
std::vector<torch::Tensor>
getIndicePairV2(torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape,
std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM,
int64_t _transpose, int64_t _useHash);
template <unsigned NDim>
std::vector<torch::Tensor> getIndicePairPreGrid(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose,
int64_t _useHash) {
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool transpose = _transpose != 0;
bool useHash = _useHash != 0;
TV_ASSERT_RT_ERR(!useHash, "error");
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
// std::cout << "full time " << timer.report() / 1000.0 << std::endl;
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
gridOut.fill_(-1);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, true);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose, true);
gridOut.fill_(-1);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
true);
}
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters, torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut, torch::Tensor indiceNum, int64_t numActOut,
......
...@@ -66,7 +66,7 @@ public: ...@@ -66,7 +66,7 @@ public:
class InferenceContext { class InferenceContext {
public: public:
explicit InferenceContext(const std::string& engine_bin, int device) explicit InferenceContext(const std::string &engine_bin, int device)
: logger_(nvinfer1::ILogger::Severity::kINFO), device_(device) { : logger_(nvinfer1::ILogger::Severity::kINFO), device_(device) {
TV_ASSERT_INVALID_ARG(device >= 0, "invalid device id"); TV_ASSERT_INVALID_ARG(device >= 0, "invalid device id");
int deviceCount; int deviceCount;
...@@ -159,9 +159,9 @@ public: ...@@ -159,9 +159,9 @@ public:
return (*this)(inputs_vec); return (*this)(inputs_vec);
} }
tv::Tensor operator[](std::string name){ tv::Tensor operator[](std::string name) {
auto iter = name_to_host_mem_.find(name); auto iter = name_to_host_mem_.find(name);
if (iter == name_to_host_mem_.end()){ if (iter == name_to_host_mem_.end()) {
TV_THROW_INVALID_ARG(name, "not found."); TV_THROW_INVALID_ARG(name, "not found.");
} }
return iter->second; return iter->second;
...@@ -204,4 +204,4 @@ private: ...@@ -204,4 +204,4 @@ private:
int device_; int device_;
}; };
} // namespace trtplus } // namespace trt
...@@ -23,6 +23,9 @@ template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) { ...@@ -23,6 +23,9 @@ template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) {
} // namespace detail } // namespace detail
template <class... T>
using mp_length = std::integral_constant<std::size_t, sizeof...(T)>;
namespace detail { namespace detail {
template <class A, template <class...> class B> struct mp_rename_impl { template <class A, template <class...> class B> struct mp_rename_impl {
...@@ -40,6 +43,8 @@ struct mp_rename_impl<A<T...>, B> { ...@@ -40,6 +43,8 @@ struct mp_rename_impl<A<T...>, B> {
template <class A, template <class...> class B> template <class A, template <class...> class B>
using mp_rename = typename detail::mp_rename_impl<A, B>::type; using mp_rename = typename detail::mp_rename_impl<A, B>::type;
template <class L> using mp_size = mp_rename<L, mp_length>;
template <class L, class F> constexpr F mp_for_each(F &&f) { template <class L, class F> constexpr F mp_for_each(F &&f) {
return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f)); return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f));
} }
......
...@@ -56,9 +56,15 @@ using dtype_collection_t = ...@@ -56,9 +56,15 @@ using dtype_collection_t =
tv::mp_list_c<int, float32, int32, int16, int8, float64, bool_, uint8, tv::mp_list_c<int, float32, int32, int16, int8, float64, bool_, uint8,
float16, int64, uint16, uint32, uint64>; float16, int64, uint16, uint32, uint64>;
#ifdef TV_CUDA
using all_tensor_types_t = using all_tensor_types_t =
std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t, std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
uint16_t, uint32_t, uint64_t, bool>; uint16_t, uint32_t, uint64_t, bool>;
#else
using all_tensor_types_t =
std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
uint16_t, uint32_t, uint64_t, bool>;
#endif
template <typename T> class TensorStorage { template <typename T> class TensorStorage {
public: public:
...@@ -395,6 +401,63 @@ void dispatch_int(int idx, BinaryPredicate p, F &&f) { ...@@ -395,6 +401,63 @@ void dispatch_int(int idx, BinaryPredicate p, F &&f) {
} }
} }
// Ts is pack of mp_list_c
template <class... Ts, typename Iterator, typename F>
bool dispatch_container_noexcept(Iterator begin, Iterator end, F &&f) {
static_assert(sizeof...(Ts) > 0,
"you need to provide at least one candidate");
bool notFound = true;
mp_for_each<mp_list<Ts...>>([=, &notFound, &f](auto I) {
using val_lst_t = decltype(I);
auto val_lst_size = mp_size<val_lst_t>::value;
bool equal = true;
std::size_t count = 0;
auto iter = begin;
mp_for_each<val_lst_t>([&](auto E) {
if (iter == end || !equal) {
return;
}
if (count >= val_lst_size) {
TV_THROW_INVALID_ARG("iterator length invalid:", val_lst_size);
}
constexpr auto c = decltype(E)::value;
if (c != *iter) {
equal = false;
}
++count;
std::advance(iter, 1);
});
if (count != val_lst_size || iter != end) {
equal = false;
}
if (equal && notFound) {
std::forward<F>(f)(I);
notFound = false;
}
});
return !notFound;
}
template <class... Ts, typename Iterator, typename F>
void dispatch_container(Iterator begin, Iterator end, F &&f) {
if (!dispatch_container_noexcept<Ts...>(begin, end, std::forward<F>(f))) {
std::stringstream ss;
ss << "unknown value [";
for (auto iter = begin; iter != end; std::advance(iter, 1)) {
ss << *iter << ",";
}
ss << "], available: ";
mp_for_each<mp_list<Ts...>>([=, &ss](auto I) {
ss << "[";
mp_for_each<decltype(I)>(
[=, &ss](auto E) { ss << decltype(E)::value << ","; });
ss << "]";
});
TV_THROW_RT_ERR(ss.str());
}
}
/* /*
template <int... Is, typename F> void dispatch_int(int idx, F &&f) { template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
return dispatch_scalar<int, Is...>(idx, f); return dispatch_scalar<int, Is...>(idx, f);
...@@ -410,6 +473,26 @@ struct Dispatch<T<Args...>> { ...@@ -410,6 +473,26 @@ struct Dispatch<T<Args...>> {
} }
}; };
template <class T> struct DispatchContainer;
template <template <class...> class T, class... Args>
struct DispatchContainer<T<Args...>> {
template <typename Iterator, typename F>
inline void operator()(Iterator begin, Iterator end, F &&f) {
return dispatch_container<Args...>(begin, end, std::forward<F>(f));
}
};
template <class T> struct DispatchContainerNoexcept;
template <template <class...> class T, class... Args>
struct DispatchContainerNoexcept<T<Args...>> {
template <typename Iterator, typename F>
inline bool operator()(Iterator begin, Iterator end, F &&f) {
return dispatch_container_noexcept<Args...>(begin, end, std::forward<F>(f));
}
};
template <class T> struct DispatchInt; template <class T> struct DispatchInt;
// Args should be std::integral_constant<int, value> // Args should be std::integral_constant<int, value>
...@@ -531,13 +614,11 @@ struct Tensor { ...@@ -531,13 +614,11 @@ struct Tensor {
typename Tindex = int, typename Tindex = int,
typename std::enable_if<Rank == -1, int>::type = 0> typename std::enable_if<Rank == -1, int>::type = 0>
TensorView<T, Rank, PtrTraits, Tindex> tview() { TensorView<T, Rank, PtrTraits, Tindex> tview() {
using tv_shape_t =
typename TensorView<T, Rank, PtrTraits, Tindex>::tv_shape_t;
writable_check(); writable_check();
static_assert(Rank == -1 || Rank > 0, "error"); static_assert(Rank == -1 || Rank > 0, "error");
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error"); TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim()); ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
for (int i = 0; i < ndim(); ++i) { for (size_t i = 0; i < ndim(); ++i) {
shape[i] = shape_[i]; shape[i] = shape_[i];
stride[i] = stride_[i]; stride[i] = stride_[i];
} }
...@@ -579,7 +660,7 @@ struct Tensor { ...@@ -579,7 +660,7 @@ struct Tensor {
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error"); TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim()); ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
for (int i = 0; i < ndim(); ++i) { for (int i = 0; i < int(ndim()); ++i) {
shape[i] = shape_[i]; shape[i] = shape_[i];
stride[i] = stride_[i]; stride[i] = stride_[i];
} }
...@@ -621,6 +702,21 @@ struct Tensor { ...@@ -621,6 +702,21 @@ struct Tensor {
return res; return res;
} }
Tensor operator[](int64_t index) {
TV_ASSERT_INVALID_ARG(ndim() > 1, "error");
if (index < 0) {
index += dim(0);
}
TV_ASSERT_INVALID_ARG(index < dim(0), "error");
Tensor res = Tensor();
res.storage_ = storage_;
res.shape_ = shape_.subshape(1);
res.offset_ = offset_ + index * stride_[0];
res.stride_ = stride_.subshape(1);
res.writeable_ = writeable_;
return res;
}
Tensor squeeze() const { return view(shape_.squeeze()); } Tensor squeeze() const { return view(shape_.squeeze()); }
Tensor squeeze(int axis) const { Tensor squeeze(int axis) const {
...@@ -665,6 +761,7 @@ struct Tensor { ...@@ -665,6 +761,7 @@ struct Tensor {
size_t ndim() const { return shape_.ndim(); } size_t ndim() const { return shape_.ndim(); }
const TensorShape &shape() const { return shape_; } const TensorShape &shape() const { return shape_; }
const TensorShape &sizes() const { return shape_; }
const TensorShape &stride() const { return stride_; } const TensorShape &stride() const { return stride_; }
int dim(int idx) const { int dim(int idx) const {
...@@ -679,6 +776,7 @@ struct Tensor { ...@@ -679,6 +776,7 @@ struct Tensor {
const uint8_t *raw_data() const { return storage_->data() + offset_; } const uint8_t *raw_data() const { return storage_->data() + offset_; }
size_t raw_size() const { return size() * itemsize(); } size_t raw_size() const { return size() * itemsize(); }
size_t size() const { return shape_.size(); } size_t size() const { return shape_.size(); }
size_t size(int64_t idx) const { return dim(idx); }
size_t itemsize() const { return detail::sizeof_dtype(dtype_); } size_t itemsize() const { return detail::sizeof_dtype(dtype_); }
Tensor &zero_() { Tensor &zero_() {
writable_check(); writable_check();
...@@ -716,6 +814,16 @@ struct Tensor { ...@@ -716,6 +814,16 @@ struct Tensor {
return reinterpret_cast<const T *>(raw_data()); return reinterpret_cast<const T *>(raw_data());
} }
template <typename T> T *data_ptr() { return data<T>(); }
template <typename T> const T *data_ptr() const { return data<T>(); }
void *data_ptr() { return reinterpret_cast<void *>(raw_data()); }
const void *data_ptr() const {
return reinterpret_cast<const void *>(raw_data());
}
void copy_(const Tensor &tensor) { void copy_(const Tensor &tensor) {
writable_check(); writable_check();
TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now"); TV_ASSERT_INVALID_ARG(contiguous_, "only support contiguous for now");
...@@ -837,8 +945,8 @@ struct Tensor { ...@@ -837,8 +945,8 @@ struct Tensor {
using Tcur = decltype(Icur); using Tcur = decltype(Icur);
if (std::is_convertible<Tcur, Tdst>::value) { if (std::is_convertible<Tcur, Tdst>::value) {
auto ptr = this->data<Tcur>(); auto ptr = this->data<Tcur>();
tensor = Tensor(this->shape_, this->stride_, dtype, this->device(), this->pinned(), tensor = Tensor(this->shape_, this->stride_, dtype, this->device(),
this->storage_->managed()); this->pinned(), this->storage_->managed());
std::copy(ptr, ptr + this->size(), tensor.data<Tdst>()); std::copy(ptr, ptr + this->size(), tensor.data<Tdst>());
} else { } else {
TV_THROW_INVALID_ARG("not convertable from", type_s<Tcur>, "to", TV_THROW_INVALID_ARG("not convertable from", type_s<Tcur>, "to",
......
...@@ -13,13 +13,13 @@ ...@@ -13,13 +13,13 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "common.h"
#include "prettyprint.h"
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <cstdlib> #include <cstdlib>
#include "common.h"
#include "prettyprint.h"
#include <iostream> #include <iostream>
#include <iterator>
#include <memory> #include <memory>
#include <sstream> #include <sstream>
#include <type_traits> #include <type_traits>
...@@ -121,6 +121,15 @@ constexpr size_t calc_align(size_t ndim) ...@@ -121,6 +121,15 @@ constexpr size_t calc_align(size_t ndim)
return 64; return 64;
} }
*/ */
namespace detail {
template <typename _InIter>
using _RequireInputIter = typename std::enable_if<std::is_convertible<
typename std::iterator_traits<_InIter>::iterator_category,
std::input_iterator_tag>::value>::type;
}
template <typename T, size_t MaxDim = TV_MAX_DIM> template <typename T, size_t MaxDim = TV_MAX_DIM>
struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector { struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
public: public:
...@@ -131,7 +140,8 @@ public: ...@@ -131,7 +140,8 @@ public:
array_[i] = init; array_[i] = init;
} }
}; };
template <typename Iterator> SimpleVector(Iterator first, Iterator last) { template <typename Iterator, typename = detail::_RequireInputIter<Iterator>>
SimpleVector(Iterator first, Iterator last) {
size_ = 0; size_ = 0;
for (; first != last; ++first) { for (; first != last; ++first) {
if (size_ >= MaxDim) { if (size_ >= MaxDim) {
...@@ -588,6 +598,12 @@ template <int N, int Ndim> struct ArrayIndexRowMajor { ...@@ -588,6 +598,12 @@ template <int N, int Ndim> struct ArrayIndexRowMajor {
return ArrayIndexRowMajor<N - 1, Ndim>::runShape( return ArrayIndexRowMajor<N - 1, Ndim>::runShape(
shape, (index + start) * shape[Ndim - N + 1], inds...); shape, (index + start) * shape[Ndim - N + 1], inds...);
} }
template <typename TShape, typename Tinit>
TV_HOST_DEVICE_INLINE static unsigned
runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
return ArrayIndexRowMajor<N - 1, Ndim>::runPtrs(
indexes, shape, (indexes[Ndim - N] + start) * shape[Ndim - N + 1]);
}
}; };
template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> { template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> {
...@@ -601,6 +617,11 @@ template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> { ...@@ -601,6 +617,11 @@ template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> {
Tinit start, T idx) { Tinit start, T idx) {
return start + idx; return start + idx;
} }
template <typename TShape, typename Tinit>
TV_HOST_DEVICE_INLINE static unsigned
runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
return start + indexes[Ndim - 1];
}
}; };
template <> struct ArrayIndexRowMajor<0, 0> { template <> struct ArrayIndexRowMajor<0, 0> {
...@@ -613,6 +634,11 @@ template <> struct ArrayIndexRowMajor<0, 0> { ...@@ -613,6 +634,11 @@ template <> struct ArrayIndexRowMajor<0, 0> {
Tinit start) { Tinit start) {
return 0; return 0;
} }
template <typename TShape, typename Tinit>
TV_HOST_DEVICE_INLINE static unsigned
runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
return 0;
}
}; };
template <int N, int Ndim> struct ArrayIndexStride { template <int N, int Ndim> struct ArrayIndexStride {
...@@ -744,8 +770,8 @@ struct TensorAccesserBase { ...@@ -744,8 +770,8 @@ struct TensorAccesserBase {
} }
protected: protected:
const Tindex *stride_ptr_;
ptr_t ptr_; ptr_t ptr_;
const Tindex *stride_ptr_;
}; };
} // namespace detail } // namespace detail
...@@ -1091,8 +1117,7 @@ struct TensorView { ...@@ -1091,8 +1117,7 @@ struct TensorView {
return TensorView<T, Rank, PtrTraits, Tindex>( return TensorView<T, Rank, PtrTraits, Tindex>(
ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size())); ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size()));
} }
template <typename Os> template <typename Os> std::string repr(Os &ss) const {
std::string repr(Os &ss, int limit = 1000, int limit_axis = 6) const {
if (empty()) if (empty())
return ""; return "";
if (shape_.ndim() == 0) { if (shape_.ndim() == 0) {
...@@ -1100,7 +1125,6 @@ struct TensorView { ...@@ -1100,7 +1125,6 @@ struct TensorView {
ss << *ptr_; ss << *ptr_;
return ss.str(); return ss.str();
} }
bool enable_limit = size() > limit;
SimpleVector<int64_t, TV_MAX_DIM> prev(ndim(), -1); SimpleVector<int64_t, TV_MAX_DIM> prev(ndim(), -1);
SimpleVector<int64_t, TV_MAX_DIM> nd_index(ndim()); SimpleVector<int64_t, TV_MAX_DIM> nd_index(ndim());
...@@ -1111,7 +1135,7 @@ struct TensorView { ...@@ -1111,7 +1135,7 @@ struct TensorView {
ss << "Tensor[" << type_s<T> << "]: shape=" << shape() ss << "Tensor[" << type_s<T> << "]: shape=" << shape()
<< ", stride=" << stride() << std::endl; << ", stride=" << stride() << std::endl;
auto ndimValue = ndim(); auto ndimValue = ndim();
for (int64_t i = 0; i < size(); ++i) { for (int64_t i = 0; i < int64_t(size()); ++i) {
rowArrayIdxInv(i, nd_index.data(), _shape.data(), ndimValue); rowArrayIdxInv(i, nd_index.data(), _shape.data(), ndimValue);
bool newline = false; bool newline = false;
int end_count = 0; int end_count = 0;
......
...@@ -133,7 +133,6 @@ torch::Tensor torch_slice_first_axis(torch::Tensor tensor, T start, T end) { ...@@ -133,7 +133,6 @@ torch::Tensor torch_slice_first_axis(torch::Tensor tensor, T start, T end) {
auto tensor_shape = tensor.sizes(); auto tensor_shape = tensor.sizes();
std::vector<int64_t> shape(tensor_shape.begin(), tensor_shape.end()); std::vector<int64_t> shape(tensor_shape.begin(), tensor_shape.end());
shape[0] = end - start; shape[0] = end - start;
auto dtype = tensor.scalar_type();
uint8_t *ptr = reinterpret_cast<uint8_t *>(tensor.data_ptr()); uint8_t *ptr = reinterpret_cast<uint8_t *>(tensor.data_ptr());
res = torch::from_blob(ptr + start * tensor.stride(0) * tensor.itemsize(), res = torch::from_blob(ptr + start * tensor.stride(0) * tensor.itemsize(),
torch::IntArrayRef(shape), tensor.options()); torch::IntArrayRef(shape), tensor.options());
......
...@@ -96,8 +96,9 @@ class SparseConvTensor(object): ...@@ -96,8 +96,9 @@ class SparseConvTensor(object):
def dense(self, channels_first=True): def dense(self, channels_first=True):
output_shape = [self.batch_size] + list( output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]] self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(self.indices.long().to(self.features.device), res = scatter_nd(
self.features, output_shape) self.indices.to(self.features.device).long(), self.features,
output_shape)
if not channels_first: if not channels_first:
return res return res
ndim = len(self.spatial_shape) ndim = len(self.spatial_shape)
......
...@@ -20,9 +20,9 @@ import spconv ...@@ -20,9 +20,9 @@ import spconv
class ConvAlgo(Enum): class ConvAlgo(Enum):
Native = 0 # small memory cost, faster when number of points is large. Native = 0 # small memory cost, faster when number of points is large.
Batch = 1 # high memory cost, faster when number of points is small (< 50000) Batch = 1 # high memory cost, faster when number of points is small (< 50000)
BatchGemmGather = 2 # high memory cost, faster when number of points medium BatchGemmGather = 2 # high memory cost, faster when number of points medium
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation): def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
...@@ -88,12 +88,11 @@ def get_indice_pairs(indices, ...@@ -88,12 +88,11 @@ def get_indice_pairs(indices,
else: else:
out_shape = spatial_shape out_shape = spatial_shape
if grid is None: if grid is None:
res = torch.ops.spconv.get_indice_pairs_v2(indices, batch_size, res = torch.ops.spconv.get_indice_pairs(indices, batch_size, out_shape,
out_shape, spatial_shape, spatial_shape, ksize, stride,
ksize, stride, padding, padding, dilation, out_padding,
dilation, out_padding, int(subm), int(transpose),
int(subm), int(transpose), int(use_hash))
int(use_hash))
return res return res
else: else:
if ndim == 2: if ndim == 2:
......
set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc) set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc)
if (SPCONV_BuildCUDA) if (SPCONV_BuildCUDA)
set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu) set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc)
endif() endif()
add_library(spconv SHARED ${ALL_FILES}) add_library(spconv SHARED ${ALL_FILES})
......
...@@ -21,14 +21,7 @@ ...@@ -21,14 +21,7 @@
static auto registry = static auto registry =
torch::RegisterOperators() torch::RegisterOperators()
.op("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>) .op("spconv::get_indice_pairs", &spconv::getIndicePairs)
.op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
.op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
.op("spconv::get_indice_pairs_v2", &spconv::getIndicePairV2)
.op("spconv::get_indice_pairs_grid_2d",
&spconv::getIndicePairPreGrid<2>)
.op("spconv::get_indice_pairs_grid_3d",
&spconv::getIndicePairPreGrid<3>)
.op("spconv::indice_conv", &spconv::indiceConv) .op("spconv::indice_conv", &spconv::indiceConv)
.op("spconv::indice_conv_batch", &spconv::indiceConvBatch) .op("spconv::indice_conv_batch", &spconv::indiceConvBatch)
.op("spconv::indice_conv_backward", &spconv::indiceConvBackward) .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
......
#include <ATen/ATen.h>
#include <spconv/cublas_gemm.h>
namespace spconv {
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *A, int lda,
const float *B, int ldb, const float *beta, float *C,
int ldc) {
return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const __half *alpha, const __half *A, int lda,
const __half *B, int ldb, const __half *beta,
__half *C, int ldc) {
return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const at::Half *alpha, const at::Half *A, int lda,
const at::Half *B, int ldb, const at::Half *beta,
at::Half *C, int ldc) {
return cublasHgemm(handle, transa, transb, m, n, k,
reinterpret_cast<const __half *>(alpha),
reinterpret_cast<const __half *>(A), lda,
reinterpret_cast<const __half *>(B), ldb,
reinterpret_cast<const __half *>(beta),
reinterpret_cast<__half *>(C), ldc);
}
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const double *alpha, const double *A, int lda,
const double *B, int ldb, const double *beta,
double *C, int ldc) {
return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <> inline __half constant_scalar(float data) {
return __float2half(data);
}
} // namespace spconv
\ No newline at end of file
...@@ -327,67 +327,4 @@ int create_submconv_indice_pair_cpu( ...@@ -327,67 +327,4 @@ int create_submconv_indice_pair_cpu(
return numActIn; return numActIn;
} }
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
if (transpose)
return getIndicePairsDeConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
else
return getIndicePairsConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
return getIndicePairsSubM<Index, IndexGrid, NDim>(
indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_SPECS_INDEX_NDIM
} // namespace spconv } // namespace spconv
...@@ -20,10 +20,11 @@ ...@@ -20,10 +20,11 @@
#include <spconv/indice.h> #include <spconv/indice.h>
#include <tensorview/cuda_utils.h> #include <tensorview/cuda_utils.h>
#include <tensorview/mp_helper.h> #include <tensorview/mp_helper.h>
#include <tensorview/torch_utils.h>
#include <tensorview/tensor.h> #include <tensorview/tensor.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <tensorview/torch_utils.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <type_traits> #include <type_traits>
#include <utility/timer.h> #include <utility/timer.h>
...@@ -78,10 +79,12 @@ int create_conv_indice_pair_p1_cuda( ...@@ -78,10 +79,12 @@ int create_conv_indice_pair_p1_cuda(
} }
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr; cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>)); checkCudaErrors(cudaFuncGetAttributes(
tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim, MaxKernelVolume, ">", attr.numRegs); &attr,
prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim,
MaxKernelVolume, ">", attr.numRegs);
#endif #endif
}); });
}); });
}); });
...@@ -166,13 +169,16 @@ int create_conv_indice_pair_p2_cuda( ...@@ -166,13 +169,16 @@ int create_conv_indice_pair_p2_cuda(
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed"); TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr; cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>)); checkCudaErrors(cudaFuncGetAttributes(
tv::ssprint("assignGridAndIndiceOutKernel<", tv::type_s<Index>, NDim, ">", attr.numRegs); &attr, assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>));
tv::ssprint("assignGridAndIndiceOutKernel<", tv::type_s<Index>, NDim,
">", attr.numRegs);
cudaFuncAttributes attr2; cudaFuncAttributes attr2;
checkCudaErrors(cudaFuncGetAttributes(&attr2, assignIndicePairsKernel<Index, IndexGrid, NDim>)); checkCudaErrors(cudaFuncGetAttributes(
tv::ssprint("assignIndicePairsKernel<", tv::type_s<Index>, NDim, ">", attr2.numRegs); &attr2, assignIndicePairsKernel<Index, IndexGrid, NDim>));
tv::ssprint("assignIndicePairsKernel<", tv::type_s<Index>, NDim, ">",
attr2.numRegs);
#endif #endif
} }
if (resetGrid && (!useHash)) { if (resetGrid && (!useHash)) {
...@@ -187,6 +193,10 @@ int create_conv_indice_pair_p2_cuda( ...@@ -187,6 +193,10 @@ int create_conv_indice_pair_p2_cuda(
return numAct; return numAct;
} }
template <typename T> struct is_valid {
__device__ __forceinline__ bool operator()(const T x) { return x != -1; }
};
int create_submconv_indice_pair_cuda( int create_submconv_indice_pair_cuda(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs, torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize, torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
...@@ -212,6 +222,10 @@ int create_submconv_indice_pair_cuda( ...@@ -212,6 +222,10 @@ int create_submconv_indice_pair_cuda(
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end()); tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(), tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end()); outSpatialShape.end());
Index spatialVolume = 1;
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
if (useHash) { if (useHash) {
auto table = cuhash::HashTable(); auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl; // std::cout << "create " << numAct << " size table..." << std::endl;
...@@ -251,23 +265,79 @@ int create_submconv_indice_pair_cuda( ...@@ -251,23 +265,79 @@ int create_submconv_indice_pair_cuda(
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed"); TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
}); });
} else { } else {
// auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel<Index, IndexGrid, NDim> prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn), stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut), ou); tv::torch2tv<IndexGrid>(gridsOut), ou, spatialVolume);
// tv::ssprint("prepareSubMGridKernel", timer.report() / 1000.0);
TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed"); TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
tv::DispatchInt<max_kernel_vol_t>()( // when dilation all one, we use a simple kernel to calc result
ndim, std::less_equal<int>(), [&](auto I2) { bool dilation_one = true;
constexpr int MaxKernelVolume = decltype(I2)::value; for (int i = 0; i < NDim; ++i) {
getSubMIndicePairsKernel<Index, IndexGrid, NDim, MaxKernelVolume> dilation_one &= di[i] == 1;
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, }
0, stream>>>(tv::torch2tv<Index>(indicesIn), auto found = false;
tv::torch2tv<IndexGrid>(gridsOut), if (dilation_one && (NDim == 2 || NDim == 3)) {
tv::torch2tv<Index>(indicePairs), auto indiceNumCpu = indiceNum.cpu();
tv::torch2tv<Index>(indiceNum), ks, st, pa, if (NDim == 2) {
di, ou); tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed"); outSpatialShape.end());
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
constexpr int K0 = decltype(K0C)::value;
constexpr int K1 = decltype(K1C)::value;
found = true;
getSubMIndicePairsKernel2<Index, IndexGrid, K0, K1>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
});
});
} else if (NDim == 3) {
tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
outSpatialShape.end());
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
tv::dispatch_int_noexcept<1, 3, 5>(
kernelSize[2], [&](auto K2C) {
constexpr int K0 = decltype(K0C)::value;
constexpr int K1 = decltype(K1C)::value;
constexpr int K2 = decltype(K2C)::value;
found = true;
getSubMIndicePairsKernel3<Index, IndexGrid, K0, K1, K2>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_,
spatialVolume);
});
});
}); });
}
}
if (!found) {
tv::DispatchInt<
max_kernel_vol_t>()(ndim, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = decltype(I2)::value;
getSubMIndicePairsKernel<Index, IndexGrid, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks, st, pa, di,
ou);
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsKernel failed");
});
}
// tv::ssprint("getSubMIndicePairsKernel", timer.report() / 1000.0);
} }
if (resetGrid && (!useHash)) { if (resetGrid && (!useHash)) {
...@@ -282,209 +352,4 @@ int create_submconv_indice_pair_cuda( ...@@ -282,209 +352,4 @@ int create_submconv_indice_pair_cuda(
return numActIn; return numActIn;
} }
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose) {
Index batchSize = gridsOut.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
// auto timer = spconv::CudaContextTimer<>();
if (transpose)
prepareDeConvIndicePairsKernel<Index, NDim, 4096>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicePairs, indiceNum,
indicePairUnique, kernelSize, stride, padding,
dilation, outSpatialShape);
else
prepareIndicePairsKernel<Index, NDim, 4096>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicePairs, indiceNum,
indicePairUnique, kernelSize, stride, padding,
dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
// std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
return 1;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
Index batchSize = gridsOut.dim(0);
auto kernelVolume = indiceNum.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
// after unique, there is a std::numeric_limits<int>::max() in the end of
// indicePairUnique
Index numAct = indicePairUnique.dim(0) - 1;
if (useHash) {
auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl;
table.Initialize(numAct, 2.0, 4);
unsigned *d_values = nullptr;
cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
arangeKernel<unsigned>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(d_values, numAct);
bool res = table.Build(
numAct, reinterpret_cast<unsigned *>(indicePairUnique.data()),
d_values);
cudaFree(d_values);
if (!res) {
return -1; // use -1 to tell outside use CPU implementation
}
assignIndiceOutKernel<Index, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, numAct, indicePairUnique,
outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
assignIndicePairsHashKernel<Index, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, numActIn, indicePairs,
indicePairUnique, tableSize, tableData, constants,
stash_constants, stash_count);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
} else {
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
}
if (resetGrid && (!useHash)) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
}
return numAct;
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
// auto timer = spconv::CudaContextTimer<>();
if (useHash) {
auto table = cuhash::HashTable();
// std::cout << "subm create " << numActIn << " size table..." <<
// std::endl;
table.Initialize(numActIn, 2.0, 4);
unsigned *d_keyvalues = nullptr;
cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
unsigned *d_values = d_keyvalues + numActIn;
prepareSubMHashKernel<Index, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, d_keyvalues, d_values,
outSpatialShape);
TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
bool res =
table.Build(numActIn, reinterpret_cast<unsigned *>(d_keyvalues),
reinterpret_cast<unsigned *>(d_values));
cudaFree(d_keyvalues);
if (!res) {
return -1; // use -1 to tell outside use CPU implementation
}
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
getSubMIndicePairsHashKernel<Index, NDim, 4096>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicePairs, indiceNum, kernelSize,
stride, padding, dilation, outSpatialShape,
tableSize, tableData, constants, stash_constants,
stash_count);
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
} else {
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
TV_CHECK_CUDA_ERR();
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation,
outSpatialShape);
TV_CHECK_CUDA_ERR();
}
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if (resetGrid && (!useHash)) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
numActIn);
TV_CHECK_CUDA_ERR();
}
return numActIn;
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int, \
NDIM>;
#define DECLARE_GPU_INDEX(Index) \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_GPU_INDEX(int);
#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM
} // namespace spconv } // namespace spconv
\ No newline at end of file
...@@ -30,10 +30,12 @@ namespace spconv { ...@@ -30,10 +30,12 @@ namespace spconv {
using float_types_t = tv::mp_list<float, double, at::Half>; using float_types_t = tv::mp_list<float, double, at::Half>;
using int_types_t = tv::mp_list<int32_t, int64_t>; using int_types_t = tv::mp_list<int32_t, int64_t>;
template <typename T> template <typename T>
using half_vec_t = std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>; using half_vec_t =
std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
template <typename T> template <typename T>
using half_vec_sadd_t = std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>; using half_vec_sadd_t =
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>; std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) { torch::Tensor indices, int size) {
...@@ -51,49 +53,55 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, ...@@ -51,49 +53,55 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>( tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
[=, &buffer, &features, &indices, &notFound](auto NumTLP) { &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4; constexpr int NumILP = NumTLP / 4;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor)); // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int nHotBlock = (size / NumTLP) * NumTLP; int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) { if (notFound) {
if (numPlanes % NumTLP == 0) { if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) { if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t> vecload_type_t>
<<<dim3(size / NumTLP, numPlanes / NumTLP), <<<dim3(size / NumTLP, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0, dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(), stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock, indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr; cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, checkCudaErrors(cudaFuncGetAttributes(
vecload_type_t>)); &attr, gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
tv::ssprint("gatherVecBlockKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs); vecload_type_t>));
tv::ssprint("gatherVecBlockKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">",
attr.numRegs);
#endif #endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
if (size - nHotBlock > 0) { if (size - nHotBlock > 0) {
gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t> gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(1, numPlanes / NumTLP), <<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0, dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes, stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
features.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>() + nHotBlock, indices.data_ptr<Index>() + nHotBlock,
size - nHotBlock, numPlanes / vecloadFactor); size - nHotBlock, numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr; cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>)); checkCudaErrors(cudaFuncGetAttributes(
tv::ssprint("gatherVecKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs); &attr, gatherVecKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>));
tv::ssprint("gatherVecKernel<", tv::type_s<T>, tv::type_s<Index>,
int(NumTLP), NumILP, ">", attr.numRegs);
#endif #endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
}
notFound = false;
}
} }
}); notFound = false;
}
}
});
if (notFound) { if (notFound) {
constexpr int NumTLP = 64; constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4; constexpr int NumILP = NumTLP / 4;
...@@ -105,8 +113,10 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, ...@@ -105,8 +113,10 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
indices.data_ptr<Index>(), size, numPlanes); indices.data_ptr<Index>(), size, numPlanes);
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr; cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, gatherGenericKernel<T, Index, NumTLP, NumILP>)); checkCudaErrors(cudaFuncGetAttributes(
tv::ssprint("gatherGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs); &attr, gatherGenericKernel<T, Index, NumTLP, NumILP>));
tv::ssprint("gatherGenericKernel<", tv::type_s<T>, tv::type_s<Index>,
int(NumTLP), NumILP, ">", attr.numRegs);
#endif #endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
...@@ -151,9 +161,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures, ...@@ -151,9 +161,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr; cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP, checkCudaErrors(cudaFuncGetAttributes(
vecload_type_t>)); &attr, scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
tv::ssprint("scatterAddVecBlockKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs); vecload_type_t>));
tv::ssprint("scatterAddVecBlockKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">",
attr.numRegs);
#endif #endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
...@@ -167,8 +180,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures, ...@@ -167,8 +180,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
size - nHotBlock, numPlanes); size - nHotBlock, numPlanes);
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr; cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>)); checkCudaErrors(cudaFuncGetAttributes(
tv::ssprint("scatterAddGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs); &attr,
scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
tv::ssprint("scatterAddGenericKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">",
attr.numRegs);
#endif #endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
...@@ -187,8 +204,10 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures, ...@@ -187,8 +204,10 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
indices.data_ptr<Index>(), size, numPlanes); indices.data_ptr<Index>(), size, numPlanes);
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr; cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>)); checkCudaErrors(cudaFuncGetAttributes(
tv::ssprint("notfound scatterAddGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs); &attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
tv::ssprint("notfound scatterAddGenericKernel<", tv::type_s<T>,
tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
#endif #endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
......
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
namespace spconv { namespace spconv {
std::vector<torch::Tensor> std::vector<torch::Tensor>
getIndicePairV2(torch::Tensor indices, int64_t batchSize, getIndicePairs(torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> outSpatialShape,
std::vector<int64_t> spatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride, std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation, std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, std::vector<int64_t> outPadding, int64_t _subM,
int64_t _transpose, int64_t _useHash) { int64_t _transpose, int64_t _useHash) {
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0; bool subM = _subM != 0;
bool transpose = _transpose != 0; bool transpose = _transpose != 0;
...@@ -57,6 +57,7 @@ getIndicePairV2(torch::Tensor indices, int64_t batchSize, ...@@ -57,6 +57,7 @@ getIndicePairV2(torch::Tensor indices, int64_t batchSize,
stride[i] = 1; stride[i] = 1;
} }
} }
// tv::ssprint("prepare", timer.report() / 1000.0);
if (subM) { if (subM) {
if (indices.device().type() == torch::kCPU) { if (indices.device().type() == torch::kCPU) {
numActOut = create_submconv_indice_pair_cpu( numActOut = create_submconv_indice_pair_cpu(
...@@ -85,6 +86,7 @@ getIndicePairV2(torch::Tensor indices, int64_t batchSize, ...@@ -85,6 +86,7 @@ getIndicePairV2(torch::Tensor indices, int64_t batchSize,
else { else {
TV_THROW_INVALID_ARG("unknown device type"); TV_THROW_INVALID_ARG("unknown device type");
} }
// tv::ssprint("subm", timer.report() / 1000.0);
return {indices, indicePairs, indiceNum}; return {indices, indicePairs, indiceNum};
} else { } else {
auto indicePairUnique = torch::full( auto indicePairUnique = torch::full(
...@@ -193,6 +195,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -193,6 +195,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
// TODO torch::from_blob is a little slow
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(), auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
{nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = auto inputBufferBlob =
...@@ -406,11 +409,11 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -406,11 +409,11 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::TensorOptions().dtype(features.dtype()).device(features.device()); torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes(); auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options); torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options); torch::Tensor filtersGrad = torch::empty(filterShape, options);
torch::Tensor inputBuffer = torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options); torch::empty({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer = torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options); torch::empty({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes}); filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes}); filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment