Commit 3517290c authored by yanyan's avatar yanyan
Browse files

format code, add benchmark per layer

parent 540a2209
...@@ -198,7 +198,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, ...@@ -198,7 +198,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w,
if (nHot > o) \ if (nHot > o) \
dConvolution_KMxKN_forwardB<T, K, V> \ dConvolution_KMxKN_forwardB<T, K, V> \
<<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \ <<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \
inFeatures, outFeatures, w, rulesIn + o, rulesOut + o, \ inFeatures, outFeatures, w, rulesIn + o, rulesOut + o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \ nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \ output_stride); \
return; \ return; \
...@@ -400,8 +400,8 @@ __global__ void dConvolution_KMxKN_backward_dW_B( ...@@ -400,8 +400,8 @@ __global__ void dConvolution_KMxKN_backward_dW_B(
if (nHot > o) \ if (nHot > o) \
dConvolution_KMxKN_backward_dW_B<T, K, V> \ dConvolution_KMxKN_backward_dW_B<T, K, V> \
<<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \ <<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn + o, \ inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn + o, \
rulesOut + o, nHot - o, input_nPlanes, input_stride, \ rulesOut + o, nHot - o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \ output_nPlanes, output_stride); \
return; \ return; \
} \ } \
......
...@@ -21,15 +21,16 @@ ...@@ -21,15 +21,16 @@
namespace spconv { namespace spconv {
template <bool UseDeconv, typename Index, unsigned NDim> struct ConvIndiceDispatch; template <bool UseDeconv, typename Index, unsigned NDim>
struct ConvIndiceDispatch;
template <typename Index, unsigned NDim> template <typename Index, unsigned NDim>
struct ConvIndiceDispatch<true, Index, NDim>{ struct ConvIndiceDispatch<true, Index, NDim> {
constexpr static auto* func = getValidOutPosTranspose<Index, NDim>; constexpr static auto *func = getValidOutPosTranspose<Index, NDim>;
}; };
template <typename Index, unsigned NDim> template <typename Index, unsigned NDim>
struct ConvIndiceDispatch<false, Index, NDim>{ struct ConvIndiceDispatch<false, Index, NDim> {
constexpr static auto* func = getValidOutPos<Index, NDim>; constexpr static auto *func = getValidOutPos<Index, NDim>;
}; };
template <typename Index, unsigned NDim, bool UseDeconv, template <typename Index, unsigned NDim, bool UseDeconv,
...@@ -61,8 +62,8 @@ __global__ void prepareIndicePairsKernel( ...@@ -61,8 +62,8 @@ __global__ void prepareIndicePairsKernel(
for (int ix : tv::KernelLoopX<int>(numActIn)) { for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = ConvIndiceDispatch<UseDeconv, Index, NDim>::func( numValidPoints = ConvIndiceDispatch<UseDeconv, Index, NDim>::func(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
outSpatialShape.data(), validPoints); validPoints);
for (Index i = 0; i < numValidPoints; ++i) { for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1); pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim]; auto offset = pointPtr[NDim];
......
...@@ -89,7 +89,6 @@ __global__ void matmul(const Dtype *A, const int wA, const int hA, ...@@ -89,7 +89,6 @@ __global__ void matmul(const Dtype *A, const int wA, const int hA,
// C[wB * out_row + x] += Csub; // C[wB * out_row + x] += Csub;
} }
template <typename Dtype, typename Itype, int BLOCK_SIZE> template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void matmul2(const Dtype *A, const int wA, const int hA, __global__ void matmul2(const Dtype *A, const int wA, const int hA,
const Dtype *B, const int wB, const int hB, const Dtype *B, const int wB, const int hB,
......
...@@ -7,10 +7,8 @@ ...@@ -7,10 +7,8 @@
namespace spconv { namespace spconv {
template <typename Index, unsigned NDim> template <typename Index, unsigned NDim>
__global__ void scatterPointToGridKernel( __global__ void scatterPointToGridKernel(
tv::TensorView<const float> points, tv::TensorView<const float> points, tv::TensorView<const Index> indexes,
tv::TensorView<const Index> indexes, tv::TensorView<float> grids, tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<float> grids,
tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<Index> pointIndex, tv::TensorView<Index> pointIndex,
const tv::SimpleVector<Index, NDim> gridShape) { const tv::SimpleVector<Index, NDim> gridShape) {
Index index; Index index;
...@@ -19,24 +17,25 @@ __global__ void scatterPointToGridKernel( ...@@ -19,24 +17,25 @@ __global__ void scatterPointToGridKernel(
for (int ix : tv::KernelLoopX<int>(numPoints)) { for (int ix : tv::KernelLoopX<int>(numPoints)) {
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs( index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
indexes.data() + ix * NDim, gridShape.data(), 0); indexes.data() + ix * NDim, gridShape.data(), 0);
pointIndex(ix) = index; pointIndex(ix) = index;
atomicAdd(numPointsPerGrid.data() + index, Index(1)); atomicAdd(numPointsPerGrid.data() + index, Index(1));
#pragma unroll #pragma unroll
for (int k = 0; k != numFeatures; ++k) { for (int k = 0; k != numFeatures; ++k) {
atomicAdd(grids.data() + index * numFeatures + k, *(points.data() + ix * numFeatures + k)); atomicAdd(grids.data() + index * numFeatures + k,
*(points.data() + ix * numFeatures + k));
} }
} }
} }
template <typename Index, unsigned NDim> template <typename Index, unsigned NDim>
__global__ void gatherPointFromGridKernel( __global__ void
tv::TensorView<const float> grids, gatherPointFromGridKernel(tv::TensorView<const float> grids,
tv::TensorView<const Index> numPointsPerGrid, tv::TensorView<const Index> numPointsPerGrid,
tv::TensorView<const Index> pointIndexUnique, tv::TensorView<const Index> pointIndexUnique,
tv::TensorView<float> voxels, tv::TensorView<float> voxels,
tv::TensorView<Index> coors, tv::TensorView<Index> coors,
const tv::SimpleVector<Index, NDim> gridShape) { const tv::SimpleVector<Index, NDim> gridShape) {
Index index; Index index;
int numVoxels = voxels.dim(0); int numVoxels = voxels.dim(0);
int numFeatures = grids.dim(1); int numFeatures = grids.dim(1);
...@@ -47,16 +46,15 @@ __global__ void gatherPointFromGridKernel( ...@@ -47,16 +46,15 @@ __global__ void gatherPointFromGridKernel(
for (int k = 0; k != numFeatures; ++k) { for (int k = 0; k != numFeatures; ++k) {
voxels(ix, k) = grids(index, k) / numPointsPerGrid(index); voxels(ix, k) = grids(index, k) / numPointsPerGrid(index);
} }
index = tv::rowArrayIdxInv<Index, NDim>( index = tv::rowArrayIdxInv<Index, NDim>(index, coors.data() + ix * NDim,
index, coors.data() + ix * NDim, gridShape.data()); gridShape.data());
} }
} }
template <typename Index> template <typename Index>
__global__ void resetGridKernel( __global__ void resetGridKernel(tv::TensorView<float> grids,
tv::TensorView<float> grids, tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<Index> numPointsPerGrid, tv::TensorView<Index> pointIndexUnique) {
tv::TensorView<Index> pointIndexUnique) {
Index index; Index index;
int numVoxels = pointIndexUnique.dim(0) - 1; int numVoxels = pointIndexUnique.dim(0) - 1;
int numFeatures = grids.dim(1); int numFeatures = grids.dim(1);
...@@ -72,8 +70,8 @@ __global__ void resetGridKernel( ...@@ -72,8 +70,8 @@ __global__ void resetGridKernel(
} }
template <typename Index> template <typename Index>
__global__ void resetPointIndexKernel( __global__ void resetPointIndexKernel(tv::TensorView<Index> pointIndex,
tv::TensorView<Index> pointIndex, const Index gridVolume) { const Index gridVolume) {
int num_max_points = pointIndex.dim(0) - 1; int num_max_points = pointIndex.dim(0) - 1;
for (int ix : tv::KernelLoopX<int>(num_max_points)) { for (int ix : tv::KernelLoopX<int>(num_max_points)) {
......
...@@ -21,15 +21,10 @@ ...@@ -21,15 +21,10 @@
namespace spconv { namespace spconv {
int64_t int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
pointsToVoxel(torch::Tensor points, torch::Tensor pointIndex, torch::Tensor grids,
torch::Tensor indexes, torch::Tensor numPointsPerGrid, torch::Tensor voxels,
torch::Tensor pointIndex, torch::Tensor coors, std::vector<int64_t> gridShape,
torch::Tensor grids, const int64_t ndim);
torch::Tensor numPointsPerGrid,
torch::Tensor voxels,
torch::Tensor coors,
std::vector<int64_t> gridShape,
const int64_t ndim);
} // namespace spconv } // namespace spconv
...@@ -3,24 +3,20 @@ ...@@ -3,24 +3,20 @@
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <torch/script.h> #include <torch/script.h>
namespace spconv { namespace spconv {
void scatter_point_to_grid_cuda( void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes,
torch::Tensor points, torch::Tensor grids,
torch::Tensor indexes, torch::Tensor numPointsPerGrid,
torch::Tensor grids, torch::Tensor pointIndex,
torch::Tensor numPointsPerGrid, std::vector<int64_t> gridShape, const int ndim);
torch::Tensor pointIndex,
std::vector<int64_t> gridShape,
const int ndim);
void gather_point_from_grid_cuda( void gather_point_from_grid_cuda(torch::Tensor grids,
torch::Tensor grids, torch::Tensor numPointsPerGrid, torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex, torch::Tensor pointIndex,
torch::Tensor pointIndexUnique, torch::Tensor pointIndexUnique,
torch::Tensor voxels, torch::Tensor coors, torch::Tensor voxels, torch::Tensor coors,
std::vector<int64_t> gridShape, std::vector<int64_t> gridShape,
const int ndim); const int ndim);
} // namespace spconv } // namespace spconv
...@@ -23,9 +23,15 @@ ...@@ -23,9 +23,15 @@
namespace spconv { namespace spconv {
enum ConvAlgo { kNative = 0, kBatch, kBatchGemmGather, kSparseConvNet, kMinkowskiEngine }; enum ConvAlgo {
using all_conv_algos_t = kNative = 0,
tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather, kSparseConvNet, kMinkowskiEngine>; kBatch,
kBatchGemmGather,
kSparseConvNet,
kMinkowskiEngine
};
using all_conv_algos_t = tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather,
kSparseConvNet, kMinkowskiEngine>;
// torch.jit's doc says only support int64, so we need to convert to int32. // torch.jit's doc says only support int64, so we need to convert to int32.
std::vector<torch::Tensor> std::vector<torch::Tensor>
......
...@@ -7,9 +7,10 @@ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) ...@@ -7,9 +7,10 @@ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet) Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
Copyright (c) 2006 Idiap Research Institute (Samy Bengio) Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) (Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
Samy Bengio, Johnny Mariethoz)
From Caffe2: From Caffe2:
...@@ -17,23 +18,23 @@ Copyright (c) 2016-present, Facebook Inc. All rights reserved. ...@@ -17,23 +18,23 @@ Copyright (c) 2016-present, Facebook Inc. All rights reserved.
All contributions by Facebook: All contributions by Facebook:
Copyright (c) 2016 Facebook Inc. Copyright (c) 2016 Facebook Inc.
All contributions by Google: All contributions by Google:
Copyright (c) 2015 Google Inc. Copyright (c) 2015 Google Inc.
All rights reserved. All rights reserved.
All contributions by Yangqing Jia: All contributions by Yangqing Jia:
Copyright (c) 2015 Yangqing Jia Copyright (c) 2015 Yangqing Jia
All rights reserved. All rights reserved.
All contributions from Caffe: All contributions from Caffe:
Copyright(c) 2013, 2014, 2015, the respective contributors Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved. All rights reserved.
All other contributions: All other contributions:
Copyright(c) 2015, 2016 the respective contributors Copyright(c) 2015, 2016 the respective contributors
All rights reserved. All rights reserved.
Caffe2 uses a copyright model similar to Caffe: each contributor holds Caffe2 uses a copyright model similar to Caffe: each contributor holds
copyright over their contributions to Caffe2. The project versioning records copyright over their contributions to Caffe2. The project versioning records
all such contribution and copyright details. If a contributor wants to further all such contribution and copyright details. If a contributor wants to further
...@@ -53,8 +54,8 @@ modification, are permitted provided that the following conditions are met: ...@@ -53,8 +54,8 @@ modification, are permitted provided that the following conditions are met:
notice, this list of conditions and the following disclaimer in the notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution. documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
and IDIAP Research Institute nor the names of its contributors may be America and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without used to endorse or promote products derived from this software without
specific prior written permission. specific prior written permission.
...@@ -75,7 +76,7 @@ POSSIBILITY OF SUCH DAMAGE. ...@@ -75,7 +76,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include <utility> #include <utility>
namespace tv { namespace tv {
#ifdef __cpp_lib_void_t #ifdef __cpp_lib_void_t
template <class T> using void_t = std::void_t<T>; template <class T> using void_t = std::void_t<T>;
...@@ -97,47 +98,67 @@ struct _identity final { ...@@ -97,47 +98,67 @@ struct _identity final {
return std::forward<T>(arg); return std::forward<T>(arg);
} }
}; };
template<class Func, class Enable = void> template <class Func, class Enable = void>
struct function_takes_identity_argument : std::false_type {}; struct function_takes_identity_argument : std::false_type {};
#if defined(_MSC_VER) #if defined(_MSC_VER)
// For some weird reason, MSVC shows a compiler error when using guts::void_t instead of std::void_t. // For some weird reason, MSVC shows a compiler error when using guts::void_t
// But we're only building on MSVC versions that have std::void_t, so let's just use that one. // instead of std::void_t. But we're only building on MSVC versions that have
template<class Func> // std::void_t, so let's just use that one.
struct function_takes_identity_argument<Func, std::void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {}; template <class Func>
struct function_takes_identity_argument<
Func, std::void_t<decltype(std::declval<Func>()(_identity()))>>
: std::true_type {};
#else #else
template<class Func> template <class Func>
struct function_takes_identity_argument<Func, void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {}; struct function_takes_identity_argument<
Func, void_t<decltype(std::declval<Func>()(_identity()))>>
: std::true_type {};
#endif #endif
template<bool Condition> template <bool Condition> struct _if_constexpr;
struct _if_constexpr;
template<> template <> struct _if_constexpr<true> final {
struct _if_constexpr<true> final { template <
template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ThenCallback>::value, void*> = nullptr> class ThenCallback, class ElseCallback,
static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) { std::enable_if_t<function_takes_identity_argument<ThenCallback>::value,
// The _identity instance passed in can be used to delay evaluation of an expression, void *> = nullptr>
// because the compiler can't know that it's just the identity we're passing in. static decltype(auto) call(ThenCallback &&thenCallback,
ElseCallback && /* elseCallback */) {
// The _identity instance passed in can be used to delay evaluation of an
// expression, because the compiler can't know that it's just the identity
// we're passing in.
return thenCallback(_identity()); return thenCallback(_identity());
} }
template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value, void*> = nullptr> template <
static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) { class ThenCallback, class ElseCallback,
std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value,
void *> = nullptr>
static decltype(auto) call(ThenCallback &&thenCallback,
ElseCallback && /* elseCallback */) {
return thenCallback(); return thenCallback();
} }
}; };
template<> template <> struct _if_constexpr<false> final {
struct _if_constexpr<false> final { template <
template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ElseCallback>::value, void*> = nullptr> class ThenCallback, class ElseCallback,
static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) { std::enable_if_t<function_takes_identity_argument<ElseCallback>::value,
// The _identity instance passed in can be used to delay evaluation of an expression, void *> = nullptr>
// because the compiler can't know that it's just the identity we're passing in. static decltype(auto) call(ThenCallback && /* thenCallback */,
ElseCallback &&elseCallback) {
// The _identity instance passed in can be used to delay evaluation of an
// expression, because the compiler can't know that it's just the identity
// we're passing in.
return elseCallback(_identity()); return elseCallback(_identity());
} }
template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value, void*> = nullptr> template <
static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) { class ThenCallback, class ElseCallback,
std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value,
void *> = nullptr>
static decltype(auto) call(ThenCallback && /* thenCallback */,
ElseCallback &&elseCallback) {
return elseCallback(); return elseCallback();
} }
}; };
...@@ -173,33 +194,40 @@ struct _if_constexpr<false> final { ...@@ -173,33 +194,40 @@ struct _if_constexpr<false> final {
* template <class T> * template <class T>
* int func(T t) { * int func(T t) {
* return if_constexpr<std::is_same<T, MyClass1>::value>( * return if_constexpr<std::is_same<T, MyClass1>::value>(
* [&](auto _) { return _(t).value; }, // this code is invalid for T == MyClass2, so a regular non-constexpr if statement wouldn't compile * [&](auto _) { return _(t).value; }, // this code is invalid for T ==
* [&](auto _) { return _(t).val; } // this code is invalid for T == MyClass1 * MyClass2, so a regular non-constexpr if statement wouldn't compile
* [&](auto _) { return _(t).val; } // this code is invalid for T ==
* MyClass1
* ); * );
* } * }
* *
* Note: The _ argument passed in Example 3 is the identity function, i.e. it does nothing. * Note: The _ argument passed in Example 3 is the identity function, i.e. it
* It is used to force the compiler to delay type checking, because the compiler * does nothing. It is used to force the compiler to delay type checking,
* doesn't know what kind of _ is passed in. Without it, the compiler would fail * because the compiler doesn't know what kind of _ is passed in. Without it,
* when you try to access t.value but the member doesn't exist. * the compiler would fail when you try to access t.value but the member doesn't
* exist.
* *
* Note: In Example 3, both branches return int, so func() returns int. This is not necessary. * Note: In Example 3, both branches return int, so func() returns int. This is
* If func() had a return type of "auto", then both branches could return different * not necessary. If func() had a return type of "auto", then both branches
* types, say func<MyClass1>() could return int and func<MyClass2>() could return string. * could return different types, say func<MyClass1>() could return int and
* func<MyClass2>() could return string.
*/ */
template<bool Condition, class ThenCallback, class ElseCallback> template <bool Condition, class ThenCallback, class ElseCallback>
decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCallback) { decltype(auto) if_constexpr(ThenCallback &&thenCallback,
ElseCallback &&elseCallback) {
#if defined(__cpp_if_constexpr) #if defined(__cpp_if_constexpr)
// If we have C++17, just use it's "if constexpr" feature instead of wrapping it. // If we have C++17, just use it's "if constexpr" feature instead of wrapping
// This will give us better error messages. // it. This will give us better error messages.
if constexpr(Condition) { if constexpr (Condition) {
if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) { if constexpr (detail::function_takes_identity_argument<
ThenCallback>::value) {
return std::forward<ThenCallback>(thenCallback)(detail::_identity()); return std::forward<ThenCallback>(thenCallback)(detail::_identity());
} else { } else {
return std::forward<ThenCallback>(thenCallback)(); return std::forward<ThenCallback>(thenCallback)();
} }
} else { } else {
if constexpr (detail::function_takes_identity_argument<ElseCallback>::value) { if constexpr (detail::function_takes_identity_argument<
ElseCallback>::value) {
return std::forward<ElseCallback>(elseCallback)(detail::_identity()); return std::forward<ElseCallback>(elseCallback)(detail::_identity());
} else { } else {
return std::forward<ElseCallback>(elseCallback)(); return std::forward<ElseCallback>(elseCallback)();
...@@ -207,18 +235,20 @@ decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCall ...@@ -207,18 +235,20 @@ decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCall
} }
#else #else
// C++14 implementation of if constexpr // C++14 implementation of if constexpr
return detail::_if_constexpr<Condition>::call(std::forward<ThenCallback>(thenCallback), return detail::_if_constexpr<Condition>::call(
std::forward<ElseCallback>(elseCallback)); std::forward<ThenCallback>(thenCallback),
std::forward<ElseCallback>(elseCallback));
#endif #endif
} }
template<bool Condition, class ThenCallback> template <bool Condition, class ThenCallback>
decltype(auto) if_constexpr(ThenCallback&& thenCallback) { decltype(auto) if_constexpr(ThenCallback &&thenCallback) {
#if defined(__cpp_if_constexpr) #if defined(__cpp_if_constexpr)
// If we have C++17, just use it's "if constexpr" feature instead of wrapping it. // If we have C++17, just use it's "if constexpr" feature instead of wrapping
// This will give us better error messages. // it. This will give us better error messages.
if constexpr(Condition) { if constexpr (Condition) {
if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) { if constexpr (detail::function_takes_identity_argument<
ThenCallback>::value) {
return std::forward<ThenCallback>(thenCallback)(detail::_identity()); return std::forward<ThenCallback>(thenCallback)(detail::_identity());
} else { } else {
return std::forward<ThenCallback>(thenCallback)(); return std::forward<ThenCallback>(thenCallback)();
...@@ -226,9 +256,9 @@ decltype(auto) if_constexpr(ThenCallback&& thenCallback) { ...@@ -226,9 +256,9 @@ decltype(auto) if_constexpr(ThenCallback&& thenCallback) {
} }
#else #else
// C++14 implementation of if constexpr // C++14 implementation of if constexpr
return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback), [] (auto) {}); return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback),
[](auto) {});
#endif #endif
} }
} // namespace tv
}
...@@ -22,13 +22,13 @@ If you can use libtorch, dont use tv::Tensor. ...@@ -22,13 +22,13 @@ If you can use libtorch, dont use tv::Tensor.
*/ */
#pragma once #pragma once
#include "cc17.h"
#include "mp_helper.h" #include "mp_helper.h"
#include "tensorview.h" #include "tensorview.h"
#include <cstring> #include <cstring>
#include <iomanip> #include <iomanip>
#include <memory> #include <memory>
#include <type_traits> #include <type_traits>
#include "cc17.h"
#ifdef TV_CUDA #ifdef TV_CUDA
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
...@@ -632,26 +632,31 @@ struct Tensor { ...@@ -632,26 +632,31 @@ struct Tensor {
tview() const { tview() const {
static_assert(Rank == -1 || Rank > 0, "error"); static_assert(Rank == -1 || Rank > 0, "error");
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error"); TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return if_constexpr<(Rank > 0)>([&](auto _){ return if_constexpr<(Rank > 0)>(
TV_ASSERT_RT_ERR(Rank == ndim(), "error"); [&](auto _) {
ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank), stride(Rank); TV_ASSERT_RT_ERR(Rank == ndim(), "error");
for (int i = 0; i < Rank; ++i) { ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank),
shape[i] = shape_[i]; stride(Rank);
stride[i] = stride_[i]; for (int i = 0; i < Rank; ++i) {
} shape[i] = shape_[i];
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>( stride[i] = stride_[i];
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape, }
stride); return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
}, [&](auto _){ Tindex>(
ShapeBase<TV_MAX_DIM, Tindex> shape(_(ndim())), stride(ndim()); reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
for (int i = 0; i < int(ndim()); ++i) { shape, stride);
shape[i] = shape_[i]; },
stride[i] = stride_[i]; [&](auto _) {
} ShapeBase<TV_MAX_DIM, Tindex> shape(_(ndim())), stride(ndim());
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>( for (int i = 0; i < int(ndim()); ++i) {
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape, shape[i] = shape_[i];
stride); stride[i] = stride_[i];
}); }
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
shape, stride);
});
} }
template <class... Inds> Tensor view(Inds... newShapes) const { template <class... Inds> Tensor view(Inds... newShapes) const {
static_assert(sizeof...(newShapes) > 0, "dont support empty for now"); static_assert(sizeof...(newShapes) > 0, "dont support empty for now");
......
...@@ -36,22 +36,21 @@ template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer { ...@@ -36,22 +36,21 @@ template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
return res; return res;
} }
template <int Count, typename F> template <int Count, typename F>
double benchmark(F&& f, int start=int(Count) * 0.3){ double benchmark(F &&f, int start = int(Count) * 0.3) {
// std::vector<TimeT::rep> times; // std::vector<TimeT::rep> times;
auto res = typename TimeT::rep(); auto res = typename TimeT::rep();
int count = 0; int count = 0;
cudaDeviceSynchronize(); cudaDeviceSynchronize();
for (int i = 0; i < Count; ++i){ for (int i = 0; i < Count; ++i) {
std::forward<F>(f)(); std::forward<F>(f)();
auto time = report(); auto time = report();
if (i >= start){ if (i >= start) {
// times.push_back(time) // times.push_back(time)
res += time; res += time;
count += 1; count += 1;
} }
} }
return res / double(count); return res / double(count);
} }
private: private:
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
#pragma once #pragma once
#include "mp_helper.h" #include "mp_helper.h"
#include <tensorview/tensorview.h>
#include <tensorview/tensor.h>
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <torch/script.h> #include <torch/script.h>
#ifdef TV_CUDA #ifdef TV_CUDA
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
......
...@@ -22,6 +22,7 @@ from spconv import ops, utils ...@@ -22,6 +22,7 @@ from spconv import ops, utils
from spconv.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d, from spconv.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
SparseConvTranspose3d, SparseInverseConv2d, SparseConvTranspose3d, SparseInverseConv2d,
SparseInverseConv3d, SubMConv2d, SubMConv3d) SparseInverseConv3d, SubMConv2d, SubMConv3d)
from spconv.core import SparseConvTensor
from spconv.identity import Identity from spconv.identity import Identity
from spconv.modules import SparseModule, SparseSequential from spconv.modules import SparseModule, SparseSequential
from spconv.ops import ConvAlgo from spconv.ops import ConvAlgo
...@@ -35,85 +36,6 @@ _LIB_PATH = str(Path(__file__).parent / _LIB_FILE_NAME) ...@@ -35,85 +36,6 @@ _LIB_PATH = str(Path(__file__).parent / _LIB_FILE_NAME)
torch.ops.load_library(_LIB_PATH) torch.ops.load_library(_LIB_PATH)
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully
when indice repeats, don't support repeat add which is supported
in tensorflow.
"""
ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
ret[slices] = updates.view(*output_shape)
return ret
class SparseConvTensor(object):
def __init__(self, features, indices, spatial_shape, batch_size,
grid=None):
"""
Args:
features: [num_points, num_features] feature tensor
indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
spatial_shape: spatial shape of your sparse data
batch_size: batch size of your sparse data
grid: pre-allocated grid tensor. should be used when the volume of spatial shape
is very large.
"""
self.features = features
self.indices = indices
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
if grid is None:
grid = torch.Tensor() # empty tensor
self.grid = grid
@classmethod
def from_dense(cls, x: torch.Tensor):
"""create sparse tensor fron channel last dense tensor by to_sparse
x must be NHWC tensor, channel last
"""
x = x.to_sparse(x.ndim - 1)
spatial_shape = x.shape[1:-1]
batch_size = x.shape[0]
indices_th = x.indices().permute(1, 0).contiguous().int()
features_th = x.values()
return cls(features_th, indices_th, spatial_shape, batch_size)
@property
def spatial_size(self):
return np.prod(self.spatial_shape)
def find_indice_pair(self, key):
if key is None:
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(
self.indices.to(self.features.device).long(), self.features,
output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
trans_params = list(range(0, ndim + 1))
trans_params.insert(1, ndim + 1)
return res.permute(*trans_params).contiguous()
@property
def sparity(self):
return self.indices.shape[0] / np.prod(
self.spatial_shape) / self.batch_size
class ToDense(SparseModule): class ToDense(SparseModule):
"""convert SparseConvTensor to NCHW dense tensor. """convert SparseConvTensor to NCHW dense tensor.
""" """
......
...@@ -24,6 +24,7 @@ from torch.nn.parameter import Parameter ...@@ -24,6 +24,7 @@ from torch.nn.parameter import Parameter
import spconv import spconv
import spconv.functional as Fsp import spconv.functional as Fsp
from spconv import ops from spconv import ops
from spconv.core import IndiceData, SparseConvTensor
from spconv.modules import SparseModule from spconv.modules import SparseModule
...@@ -72,8 +73,9 @@ class SparseConvolution(SparseModule): ...@@ -72,8 +73,9 @@ class SparseConvolution(SparseModule):
indice_key=None, indice_key=None,
fused_bn=False, fused_bn=False,
use_hash=False, use_hash=False,
algo=ops.ConvAlgo.Native): algo=ops.ConvAlgo.Native,
super(SparseConvolution, self).__init__() name=None):
super(SparseConvolution, self).__init__(name=name)
assert groups == 1 assert groups == 1
if not isinstance(kernel_size, (list, tuple)): if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim kernel_size = [kernel_size] * ndim
...@@ -123,8 +125,8 @@ class SparseConvolution(SparseModule): ...@@ -123,8 +125,8 @@ class SparseConvolution(SparseModule):
bound = 1 / math.sqrt(fan_in) bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound) init.uniform_(self.bias, -bound, bound)
def forward(self, input): def forward(self, input: SparseConvTensor):
assert isinstance(input, spconv.SparseConvTensor) assert isinstance(input, SparseConvTensor)
features = input.features features = input.features
device = features.device device = features.device
indices = input.indices indices = input.indices
...@@ -143,29 +145,58 @@ class SparseConvolution(SparseModule): ...@@ -143,29 +145,58 @@ class SparseConvolution(SparseModule):
out_spatial_shape = spatial_shape out_spatial_shape = spatial_shape
# input.update_grid(out_spatial_shape) # input.update_grid(out_spatial_shape)
# t = time.time() # t = time.time()
out_tensor = input.shadow_copy()
if input.benchmark:
if self.name is None:
raise ValueError(
"you need to assign name to spmodules before benchmark (spconv.utils.bench.assign_name_to_spmod)"
)
if self.name not in input.benchmark_record:
input.benchmark_record[self.name] = {
"type": "SparseConvolution",
"indice_gen_time": [],
"time": [],
"num_points": [],
"num_out_points": [],
"params": {
"kernel_size": self.kernel_size,
"stride": self.stride,
"padding": self.padding,
"dilation": self.dilation,
"output_padding": self.output_padding,
"subm": self.subm,
"transposed": self.transposed,
"input_channels": self.in_channels,
"out_channels": self.out_channels,
}
}
if self.conv1x1: if self.conv1x1:
features = torch.mm( features = torch.mm(
input.features, input.features,
self.weight.view(self.in_channels, self.out_channels)) self.weight.view(self.in_channels, self.out_channels))
if self.bias is not None: if self.bias is not None:
features += self.bias features += self.bias
out_tensor = spconv.SparseConvTensor(features, input.indices, out_tensor.features = features
input.spatial_shape,
input.batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor return out_tensor
datas = input.find_indice_pair(self.indice_key) datas = input.find_indice_pair(self.indice_key)
if self.inverse: if self.inverse:
assert datas is not None and self.indice_key is not None assert datas is not None and self.indice_key is not None
_, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas outids = datas.indices
indice_pairs = datas.indice_pairs
indice_pair_num = datas.indice_pair_num
out_spatial_shape = datas.out_spatial_shape
assert indice_pair_num.shape[0] == np.prod( assert indice_pair_num.shape[0] == np.prod(
self.kernel_size self.kernel_size
), "inverse conv must have same kernel size as its couple conv" ), "inverse conv must have same kernel size as its couple conv"
else: else:
if self.indice_key is not None and datas is not None: if self.indice_key is not None and datas is not None:
outids, _, indice_pairs, indice_pair_num, _ = datas outids = datas.out_indices
indice_pairs = datas.indice_pairs
indice_pair_num = datas.indice_pair_num
else: else:
if input.benchmark:
torch.cuda.synchronize()
t = time.time()
outids, indice_pairs, indice_pair_num = ops.get_indice_pairs( outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
indices, indices,
batch_size, batch_size,
...@@ -179,10 +210,19 @@ class SparseConvolution(SparseModule): ...@@ -179,10 +210,19 @@ class SparseConvolution(SparseModule):
self.transposed, self.transposed,
grid=input.grid, grid=input.grid,
use_hash=self.use_hash) use_hash=self.use_hash)
input.indice_dict[self.indice_key] = (outids, indices, if input.benchmark:
indice_pairs, torch.cuda.synchronize()
indice_pair_num, interval = time.time() - t
spatial_shape) out_tensor.benchmark_record[
self.name]["indice_gen_time"].append(interval)
indice_data = IndiceData(outids, indices, indice_pairs,
indice_pair_num, spatial_shape)
input.indice_dict[self.indice_key] = indice_data
if input.benchmark:
torch.cuda.synchronize()
t = time.time()
if self.fused_bn: if self.fused_bn:
assert self.bias is not None assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight, out_features = ops.fused_indice_conv(features, self.weight,
...@@ -210,10 +250,18 @@ class SparseConvolution(SparseModule): ...@@ -210,10 +250,18 @@ class SparseConvolution(SparseModule):
if self.bias is not None: if self.bias is not None:
out_features += self.bias out_features += self.bias
out_tensor = spconv.SparseConvTensor(out_features, outids, if input.benchmark:
out_spatial_shape, batch_size) torch.cuda.synchronize()
out_tensor.indice_dict = input.indice_dict interval = time.time() - t
out_tensor.grid = input.grid out_tensor.benchmark_record[self.name]["time"].append(interval)
out_tensor.benchmark_record[self.name]["num_points"].append(
features.shape[0])
out_tensor.benchmark_record[self.name]["num_out_points"].append(
out_features.shape[0])
out_tensor.features = out_features
out_tensor.indices = outids
out_tensor.spatial_shape = out_spatial_shape
return out_tensor return out_tensor
...@@ -229,7 +277,8 @@ class SparseConv2d(SparseConvolution): ...@@ -229,7 +277,8 @@ class SparseConv2d(SparseConvolution):
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=False, use_hash=False,
algo=ops.ConvAlgo.Native): algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv2d, self).__init__(2, super(SparseConv2d, self).__init__(2,
in_channels, in_channels,
out_channels, out_channels,
...@@ -241,7 +290,8 @@ class SparseConv2d(SparseConvolution): ...@@ -241,7 +290,8 @@ class SparseConv2d(SparseConvolution):
bias, bias,
indice_key=indice_key, indice_key=indice_key,
use_hash=use_hash, use_hash=use_hash,
algo=algo) algo=algo,
name=name)
class SparseConv3d(SparseConvolution): class SparseConv3d(SparseConvolution):
...@@ -256,7 +306,8 @@ class SparseConv3d(SparseConvolution): ...@@ -256,7 +306,8 @@ class SparseConv3d(SparseConvolution):
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=False, use_hash=False,
algo=ops.ConvAlgo.Native): algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv3d, self).__init__(3, super(SparseConv3d, self).__init__(3,
in_channels, in_channels,
out_channels, out_channels,
...@@ -268,7 +319,8 @@ class SparseConv3d(SparseConvolution): ...@@ -268,7 +319,8 @@ class SparseConv3d(SparseConvolution):
bias, bias,
indice_key=indice_key, indice_key=indice_key,
use_hash=use_hash, use_hash=use_hash,
algo=algo) algo=algo,
name=name)
class SparseConv4d(SparseConvolution): class SparseConv4d(SparseConvolution):
...@@ -283,7 +335,8 @@ class SparseConv4d(SparseConvolution): ...@@ -283,7 +335,8 @@ class SparseConv4d(SparseConvolution):
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=False, use_hash=False,
algo=ops.ConvAlgo.Native): algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv4d, self).__init__(4, super(SparseConv4d, self).__init__(4,
in_channels, in_channels,
out_channels, out_channels,
...@@ -295,7 +348,8 @@ class SparseConv4d(SparseConvolution): ...@@ -295,7 +348,8 @@ class SparseConv4d(SparseConvolution):
bias, bias,
indice_key=indice_key, indice_key=indice_key,
use_hash=use_hash, use_hash=use_hash,
algo=algo) algo=algo,
name=name)
class SparseConvTranspose2d(SparseConvolution): class SparseConvTranspose2d(SparseConvolution):
...@@ -310,7 +364,8 @@ class SparseConvTranspose2d(SparseConvolution): ...@@ -310,7 +364,8 @@ class SparseConvTranspose2d(SparseConvolution):
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=False, use_hash=False,
algo=ops.ConvAlgo.Native): algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvTranspose2d, self).__init__(2, super(SparseConvTranspose2d, self).__init__(2,
in_channels, in_channels,
out_channels, out_channels,
...@@ -323,7 +378,8 @@ class SparseConvTranspose2d(SparseConvolution): ...@@ -323,7 +378,8 @@ class SparseConvTranspose2d(SparseConvolution):
transposed=True, transposed=True,
indice_key=indice_key, indice_key=indice_key,
use_hash=use_hash, use_hash=use_hash,
algo=algo) algo=algo,
name=name)
class SparseConvTranspose3d(SparseConvolution): class SparseConvTranspose3d(SparseConvolution):
...@@ -338,7 +394,8 @@ class SparseConvTranspose3d(SparseConvolution): ...@@ -338,7 +394,8 @@ class SparseConvTranspose3d(SparseConvolution):
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=False, use_hash=False,
algo=ops.ConvAlgo.Native): algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvTranspose3d, self).__init__(3, super(SparseConvTranspose3d, self).__init__(3,
in_channels, in_channels,
out_channels, out_channels,
...@@ -351,7 +408,8 @@ class SparseConvTranspose3d(SparseConvolution): ...@@ -351,7 +408,8 @@ class SparseConvTranspose3d(SparseConvolution):
transposed=True, transposed=True,
indice_key=indice_key, indice_key=indice_key,
use_hash=use_hash, use_hash=use_hash,
algo=algo) algo=algo,
name=name)
class SparseInverseConv2d(SparseConvolution): class SparseInverseConv2d(SparseConvolution):
...@@ -361,7 +419,8 @@ class SparseInverseConv2d(SparseConvolution): ...@@ -361,7 +419,8 @@ class SparseInverseConv2d(SparseConvolution):
kernel_size, kernel_size,
indice_key, indice_key,
bias=True, bias=True,
algo=ops.ConvAlgo.Native): algo=ops.ConvAlgo.Native,
name=None):
super(SparseInverseConv2d, self).__init__(2, super(SparseInverseConv2d, self).__init__(2,
in_channels, in_channels,
out_channels, out_channels,
...@@ -369,7 +428,8 @@ class SparseInverseConv2d(SparseConvolution): ...@@ -369,7 +428,8 @@ class SparseInverseConv2d(SparseConvolution):
bias=bias, bias=bias,
inverse=True, inverse=True,
indice_key=indice_key, indice_key=indice_key,
algo=algo) algo=algo,
name=name)
class SparseInverseConv3d(SparseConvolution): class SparseInverseConv3d(SparseConvolution):
...@@ -379,7 +439,8 @@ class SparseInverseConv3d(SparseConvolution): ...@@ -379,7 +439,8 @@ class SparseInverseConv3d(SparseConvolution):
kernel_size, kernel_size,
indice_key, indice_key,
bias=True, bias=True,
algo=ops.ConvAlgo.Native): algo=ops.ConvAlgo.Native,
name=None):
super(SparseInverseConv3d, self).__init__(3, super(SparseInverseConv3d, self).__init__(3,
in_channels, in_channels,
out_channels, out_channels,
...@@ -387,7 +448,8 @@ class SparseInverseConv3d(SparseConvolution): ...@@ -387,7 +448,8 @@ class SparseInverseConv3d(SparseConvolution):
bias=bias, bias=bias,
inverse=True, inverse=True,
indice_key=indice_key, indice_key=indice_key,
algo=algo) algo=algo,
name=name)
class SubMConv2d(SparseConvolution): class SubMConv2d(SparseConvolution):
...@@ -402,7 +464,8 @@ class SubMConv2d(SparseConvolution): ...@@ -402,7 +464,8 @@ class SubMConv2d(SparseConvolution):
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=False, use_hash=False,
algo=ops.ConvAlgo.Native): algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv2d, self).__init__(2, super(SubMConv2d, self).__init__(2,
in_channels, in_channels,
out_channels, out_channels,
...@@ -415,7 +478,8 @@ class SubMConv2d(SparseConvolution): ...@@ -415,7 +478,8 @@ class SubMConv2d(SparseConvolution):
True, True,
indice_key=indice_key, indice_key=indice_key,
use_hash=use_hash, use_hash=use_hash,
algo=algo) algo=algo,
name=name)
class SubMConv3d(SparseConvolution): class SubMConv3d(SparseConvolution):
...@@ -430,7 +494,8 @@ class SubMConv3d(SparseConvolution): ...@@ -430,7 +494,8 @@ class SubMConv3d(SparseConvolution):
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=False, use_hash=False,
algo=ops.ConvAlgo.Native): algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv3d, self).__init__(3, super(SubMConv3d, self).__init__(3,
in_channels, in_channels,
out_channels, out_channels,
...@@ -443,7 +508,8 @@ class SubMConv3d(SparseConvolution): ...@@ -443,7 +508,8 @@ class SubMConv3d(SparseConvolution):
True, True,
indice_key=indice_key, indice_key=indice_key,
use_hash=use_hash, use_hash=use_hash,
algo=algo) algo=algo,
name=name)
class SubMConv4d(SparseConvolution): class SubMConv4d(SparseConvolution):
...@@ -458,7 +524,8 @@ class SubMConv4d(SparseConvolution): ...@@ -458,7 +524,8 @@ class SubMConv4d(SparseConvolution):
bias=True, bias=True,
indice_key=None, indice_key=None,
use_hash=False, use_hash=False,
algo=ops.ConvAlgo.Native): algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv4d, self).__init__(4, super(SubMConv4d, self).__init__(4,
in_channels, in_channels,
out_channels, out_channels,
...@@ -471,4 +538,5 @@ class SubMConv4d(SparseConvolution): ...@@ -471,4 +538,5 @@ class SubMConv4d(SparseConvolution):
True, True,
indice_key=indice_key, indice_key=indice_key,
use_hash=use_hash, use_hash=use_hash,
algo=algo) algo=algo,
name=name)
from typing import Optional
import numpy as np
import torch
class IndiceData(object):
def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
out_spatial_shape):
self.out_indices = out_indices
self.indices = indices
self.indice_pairs = indice_pairs
self.indice_pair_num = indice_pair_num
self.out_spatial_shape = out_spatial_shape
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully
when indice repeats, don't support repeat add which is supported
in tensorflow.
"""
ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
ret[slices] = updates.view(*output_shape)
return ret
class SparseConvTensor(object):
def __init__(self,
features,
indices,
spatial_shape,
batch_size,
grid=None,
benchmark=False):
"""
Args:
features: [num_points, num_features] feature tensor
indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
spatial_shape: spatial shape of your sparse data
batch_size: batch size of your sparse data
grid: pre-allocated grid tensor. should be used when the volume of spatial shape
is very large.
benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to
SparseConvTensor.
"""
self.features = features
self.indices = indices
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
if grid is None:
grid = torch.Tensor() # empty tensor
self.grid = grid
self.benchmark = benchmark
self.benchmark_record = {}
@classmethod
def from_dense(cls, x: torch.Tensor):
"""create sparse tensor fron channel last dense tensor by to_sparse
x must be NHWC tensor, channel last
"""
x = x.to_sparse(x.ndim - 1)
spatial_shape = x.shape[1:-1]
batch_size = x.shape[0]
indices_th = x.indices().permute(1, 0).contiguous().int()
features_th = x.values()
return cls(features_th, indices_th, spatial_shape, batch_size)
@property
def spatial_size(self):
return np.prod(self.spatial_shape)
def find_indice_pair(self, key) -> Optional[IndiceData]:
if key is None:
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(
self.indices.to(self.features.device).long(), self.features,
output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
trans_params = list(range(0, ndim + 1))
trans_params.insert(1, ndim + 1)
return res.permute(*trans_params).contiguous()
@property
def sparity(self):
return self.indices.shape[0] / np.prod(
self.spatial_shape) / self.batch_size
def shadow_copy(self) -> "SparseConvTensor":
"""create a new spconv tensor with all member unchanged"""
tensor = SparseConvTensor(self.features, self.indices,
self.spatial_shape, self.batch_size,
self.grid, self.benchmark)
tensor.benchmark_record = self.benchmark_record
tensor.indice_dict = self.indice_dict
return tensor
...@@ -49,7 +49,9 @@ def _mean_update(vals, m_vals, t): ...@@ -49,7 +49,9 @@ def _mean_update(vals, m_vals, t):
class SparseModule(nn.Module): class SparseModule(nn.Module):
""" place holder, all module subclass from this will take sptensor in SparseSequential. """ place holder, all module subclass from this will take sptensor in SparseSequential.
""" """
pass def __init__(self, name=None):
super().__init__()
self.name = name
class SparseSequential(SparseModule): class SparseSequential(SparseModule):
......
...@@ -24,7 +24,8 @@ class ConvAlgo(Enum): ...@@ -24,7 +24,8 @@ class ConvAlgo(Enum):
Batch = 1 # high memory cost, faster when number of points is small (< 50000) Batch = 1 # high memory cost, faster when number of points is small (< 50000)
BatchGemmGather = 2 # high memory cost, faster when number of points medium BatchGemmGather = 2 # high memory cost, faster when number of points medium
SparseConvNet = 3 SparseConvNet = 3
Minkowski = 4 # https://github.com/StanfordVL/MinkowskiEngine/blob/master/src/convolution.cu Minkowski = 4 # https://github.com/StanfordVL/MinkowskiEngine/blob/master/src/convolution.cu
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation): def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
ndim = len(input_size) ndim = len(input_size)
......
...@@ -24,6 +24,7 @@ from torch.nn.parameter import Parameter ...@@ -24,6 +24,7 @@ from torch.nn.parameter import Parameter
import spconv import spconv
import spconv.functional as Fsp import spconv.functional as Fsp
from spconv import ops from spconv import ops
from spconv.core import IndiceData
from spconv.modules import SparseModule from spconv.modules import SparseModule
...@@ -34,8 +35,10 @@ class SparseMaxPool(SparseModule): ...@@ -34,8 +35,10 @@ class SparseMaxPool(SparseModule):
stride=None, stride=None,
padding=0, padding=0,
dilation=1, dilation=1,
subm=False): indice_key=None,
super(SparseMaxPool, self).__init__() subm=False,
name=None):
super(SparseMaxPool, self).__init__(name=name)
if not isinstance(kernel_size, (list, tuple)): if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim kernel_size = [kernel_size] * ndim
if stride is None: if stride is None:
...@@ -52,6 +55,7 @@ class SparseMaxPool(SparseModule): ...@@ -52,6 +55,7 @@ class SparseMaxPool(SparseModule):
self.padding = padding self.padding = padding
self.subm = subm self.subm = subm
self.dilation = dilation self.dilation = dilation
self.indice_key = indice_key
def forward(self, input): def forward(self, input):
assert isinstance(input, spconv.SparseConvTensor) assert isinstance(input, spconv.SparseConvTensor)
...@@ -66,6 +70,32 @@ class SparseMaxPool(SparseModule): ...@@ -66,6 +70,32 @@ class SparseMaxPool(SparseModule):
self.dilation) self.dilation)
else: else:
out_spatial_shape = spatial_shape out_spatial_shape = spatial_shape
out_tensor = input.shadow_copy()
if input.benchmark:
if self.name is None:
raise ValueError(
"you need to assign name to spmodules before benchmark (spconv.utils.bench.assign_name_to_spmod)"
)
if self.name not in input.benchmark_record:
input.benchmark_record[self.name] = {
"type": "SparseMaxPool",
"indice_gen_time": [],
"time": [],
"num_points": [],
"num_out_points": [],
"params": {
"kernel_size": self.kernel_size,
"stride": self.stride,
"padding": self.padding,
"dilation": self.dilation,
"channels": features.shape[1],
}
}
if input.benchmark:
torch.cuda.synchronize()
t = time.time()
outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs( outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
indices, indices,
batch_size, batch_size,
...@@ -77,24 +107,65 @@ class SparseMaxPool(SparseModule): ...@@ -77,24 +107,65 @@ class SparseMaxPool(SparseModule):
0, 0,
self.subm, self.subm,
grid=input.grid) grid=input.grid)
if input.benchmark:
torch.cuda.synchronize()
interval = time.time() - t
out_tensor.benchmark_record[self.name]["indice_gen_time"].append(
interval)
t = time.time()
if self.indice_key is not None:
datas = input.find_indice_pair(self.indice_key)
if datas is None:
indice_data = IndiceData(outids, indices, indice_pairs,
indice_pairs_num, spatial_shape)
input.indice_dict[self.indice_key] = indice_data
else:
raise ValueError("indice data exists")
out_features = Fsp.indice_maxpool(features, indice_pairs.to(device), out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
indice_pairs_num.to(device), indice_pairs_num.to(device),
outids.shape[0]) outids.shape[0])
out_tensor = spconv.SparseConvTensor(out_features, outids, if input.benchmark:
out_spatial_shape, batch_size) torch.cuda.synchronize()
out_tensor.indice_dict = input.indice_dict interval = time.time() - t
out_tensor.grid = input.grid out_tensor.benchmark_record[self.name]["time"].append(interval)
out_tensor.benchmark_record[self.name]["num_points"].append(
features.shape[0])
out_tensor.benchmark_record[self.name]["num_out_points"].append(
out_features.shape[0])
out_tensor.features = out_features
out_tensor.indices = outids
out_tensor.spatial_shape = out_spatial_shape
return out_tensor return out_tensor
class SparseMaxPool2d(SparseMaxPool): class SparseMaxPool2d(SparseMaxPool):
def __init__(self, kernel_size, stride=None, padding=0, dilation=1): def __init__(self,
super(SparseMaxPool2d, self).__init__(2, kernel_size, stride, padding, kernel_size,
dilation) stride=None,
padding=0,
dilation=1,
name=None):
super(SparseMaxPool2d, self).__init__(2,
kernel_size,
stride,
padding,
dilation,
name=name)
class SparseMaxPool3d(SparseMaxPool): class SparseMaxPool3d(SparseMaxPool):
def __init__(self, kernel_size, stride=None, padding=0, dilation=1): def __init__(self,
super(SparseMaxPool3d, self).__init__(3, kernel_size, stride, padding, kernel_size,
dilation) stride=None,
padding=0,
dilation=1,
name=None):
super(SparseMaxPool3d, self).__init__(3,
kernel_size,
stride,
padding,
dilation,
name=name)
...@@ -24,9 +24,10 @@ from torch.nn.parameter import Parameter ...@@ -24,9 +24,10 @@ from torch.nn.parameter import Parameter
import spconv import spconv
from spconv.modules import SparseModule from spconv.modules import SparseModule
class RemoveDuplicate(SparseModule): class RemoveDuplicate(SparseModule):
def forward(self, x: spconv.SparseConvTensor): def forward(self, x: spconv.SparseConvTensor):
inds = x.indices inds = x.indices
spatial_shape = [x.batch_size, *x.spatial_shape] spatial_shape = [x.batch_size, *x.spatial_shape]
spatial_stride = [0] * len(spatial_shape) spatial_stride = [0] * len(spatial_shape)
val = 1 val = 1
...@@ -39,5 +40,6 @@ class RemoveDuplicate(SparseModule): ...@@ -39,5 +40,6 @@ class RemoveDuplicate(SparseModule):
_, unique_inds = torch.unique(indices_index) _, unique_inds = torch.unique(indices_index)
new_inds = inds[unique_inds] new_inds = inds[unique_inds]
new_features = x.features[unique_inds] new_features = x.features[unique_inds]
res = spconv.SparseConvTensor(new_features, new_inds, x.spatial_shape, x.batch_size, x.grid) res = spconv.SparseConvTensor(new_features, new_inds, x.spatial_shape,
return res x.batch_size, x.grid)
\ No newline at end of file return res
...@@ -294,20 +294,18 @@ class VoxelGeneratorV2: ...@@ -294,20 +294,18 @@ class VoxelGeneratorV2:
def grid_size(self): def grid_size(self):
return self._grid_size return self._grid_size
class VoxelGeneratorV3: class VoxelGeneratorV3:
def __init__(self, def __init__(self, voxel_size, point_cloud_range, max_points, num_features,
voxel_size, dtype, device):
point_cloud_range,
max_points,
num_features,
dtype,
device):
self._max_points = max_points self._max_points = max_points
self._point_cloud_range = point_cloud_range self._point_cloud_range = point_cloud_range
self._voxel_size = voxel_size self._voxel_size = voxel_size
self._grid_size = torch.round((self._point_cloud_range[3:] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32) self._grid_size = torch.round(
(self._point_cloud_range[3:] - self._point_cloud_range[:3]) /
self._voxel_size).to(torch.int32)
grid_volume = self._grid_size.prod() grid_volume = self._grid_size.prod()
self._grid_size = self._grid_size.cpu().numpy().tolist() self._grid_size = self._grid_size.cpu().numpy().tolist()
self._ndim = len(self._grid_size) self._ndim = len(self._grid_size)
...@@ -315,19 +313,34 @@ class VoxelGeneratorV3: ...@@ -315,19 +313,34 @@ class VoxelGeneratorV3:
self._dtype = dtype self._dtype = dtype
self._device = device self._device = device
self._point_index = torch.full([max_points + 1], grid_volume, dtype=torch.int32, device=self._device) self._point_index = torch.full([max_points + 1],
self._grids = torch.zeros([grid_volume, num_features], dtype=self._dtype, device=self._device) grid_volume,
self._num_points_per_grid = torch.zeros([grid_volume], dtype=torch.int32, device=self._device) dtype=torch.int32,
self._voxels = torch.zeros([max_points, num_features], dtype=self._dtype, device=self._device) device=self._device)
self._coors = torch.zeros([max_points, self._ndim], dtype=torch.int32, device=self._device) self._grids = torch.zeros([grid_volume, num_features],
dtype=self._dtype,
device=self._device)
self._num_points_per_grid = torch.zeros([grid_volume],
dtype=torch.int32,
device=self._device)
self._voxels = torch.zeros([max_points, num_features],
dtype=self._dtype,
device=self._device)
self._coors = torch.zeros([max_points, self._ndim],
dtype=torch.int32,
device=self._device)
def generate(self, points): def generate(self, points):
assert points.shape[0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(points.shape[0]) assert points.shape[
0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(
points.shape[0])
points.to(self._dtype).to(self._device) points.to(self._dtype).to(self._device)
return self.points_to_voxel(points) return self.points_to_voxel(points)
def generate_multi_gpu(self, points): def generate_multi_gpu(self, points):
assert points.shape[0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(points.shape[0]) assert points.shape[
0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(
points.shape[0])
points.to(self._dtype).to(self._device) points.to(self._dtype).to(self._device)
return self.points_to_voxel(points) return self.points_to_voxel(points)
...@@ -351,23 +364,21 @@ class VoxelGeneratorV3: ...@@ -351,23 +364,21 @@ class VoxelGeneratorV3:
coors_range: [6] list/tuple or array or tensor, float. indicate voxel range. coors_range: [6] list/tuple or array or tensor, float. indicate voxel range.
format: xyzxyz, minmax format: xyzxyz, minmax
""" """
indexes = torch.floor((points[:, :3] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32) indexes = torch.floor((points[:, :3] - self._point_cloud_range[:3]) /
num_voxel = torch.ops.spconv.points_to_voxel(points, indexes, self._voxel_size).to(torch.int32)
self._point_index, num_voxel = torch.ops.spconv.points_to_voxel(
self._grids, points, indexes, self._point_index, self._grids,
self._num_points_per_grid, self._num_points_per_grid, self._voxels, self._coors,
self._voxels, self._grid_size, self._ndim)
self._coors,
self._grid_size,
self._ndim)
voxels = self._voxels[:num_voxel, :] voxels = self._voxels[:num_voxel, :]
coors = self._coors[:num_voxel, :] coors = self._coors[:num_voxel, :]
# xyz --> zyx # xyz --> zyx
#coors = coors[::-1] #coors = coors[::-1]
x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape([-1, 1]), coors[:, 2].reshape([-1, 1]) x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape(
[-1, 1]), coors[:, 2].reshape([-1, 1])
coors = torch.cat([z, y, x], dim=1) coors = torch.cat([z, y, x], dim=1)
# can be skipped # can be skipped
# x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:] # x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
# voxels = torch.cat([z, y, x, f], dim=1) # voxels = torch.cat([z, y, x, f], dim=1)
return voxels, coors return voxels, coors
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <spconv/fused_conv.cu.h> #include <spconv/fused_conv.cu.h>
#include <spconv/fused_conv.h> #include <spconv/fused_conv.h>
#include <tensorview/torch_utils.h>
#include <spconv/minkowski.cu.h> #include <spconv/minkowski.cu.h>
#include <tensorview/torch_utils.h>
namespace spconv { namespace spconv {
void fused_conv_cuda(torch::Tensor output, torch::Tensor features, void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
...@@ -56,8 +56,8 @@ void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din, ...@@ -56,8 +56,8 @@ void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
} }
void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features, void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn, torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot) { torch::Tensor indicesOut, int nHot) {
auto dtype = output.scalar_type(); auto dtype = output.scalar_type();
auto in_nchannel = features.size(1); auto in_nchannel = features.size(1);
auto out_nchannel = output.size(1); auto out_nchannel = output.size(1);
...@@ -81,10 +81,9 @@ void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features, ...@@ -81,10 +81,9 @@ void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
int step = (nHot + num_div - 1) / num_div; int step = (nHot + num_div - 1) / num_div;
dim3 threads(shared_mem_size, shared_mem_size); dim3 threads(shared_mem_size, shared_mem_size);
tv::dispatch_torch<float>(dtype, [&](auto I) { tv::dispatch_torch<float>(dtype, [&](auto I) {
using T = decltype(I); using T = decltype(I);
tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue){ tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue) {
constexpr int ShmemSize = decltype(ShSizeValue)::value; constexpr int ShmemSize = decltype(ShSizeValue)::value;
for (int s = 0; s < num_div; s++) { for (int s = 0; s < num_div; s++) {
int remainder = nHot - step * s; int remainder = nHot - step * s;
...@@ -93,17 +92,19 @@ void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features, ...@@ -93,17 +92,19 @@ void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
(curr_num_active + threads.y - 1) / threads.y); (curr_num_active + threads.y - 1) / threads.y);
matmul<T, int32_t, ShmemSize><<<grid, threads, 0, stream>>>( matmul<T, int32_t, ShmemSize><<<grid, threads, 0, stream>>>(
features.data_ptr<T>(), in_nchannel, curr_num_active, features.data_ptr<T>(), in_nchannel, curr_num_active,
filters.data_ptr<T>(), out_nchannel, filters.data_ptr<T>(), out_nchannel, in_nchannel,
in_nchannel, output.data_ptr<T>(), indicesIn.data_ptr<int32_t>(), output.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
indicesOut.data_ptr<int32_t>()); indicesOut.data_ptr<int32_t>());
} }
}); });
}); });
} }
void fused_conv_backward_cuda_minkowski(torch::Tensor features, torch::Tensor din, void fused_conv_backward_cuda_minkowski(torch::Tensor features,
torch::Tensor dout, torch::Tensor filters, torch::Tensor din, torch::Tensor dout,
torch::Tensor dfilters, torch::Tensor indicesIn, torch::Tensor filters,
torch::Tensor indicesOut, int nHot) { torch::Tensor dfilters,
torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot) {
auto dtype = features.scalar_type(); auto dtype = features.scalar_type();
auto in_nchannel = features.size(1); auto in_nchannel = features.size(1);
auto out_nchannel = dout.size(1); auto out_nchannel = dout.size(1);
...@@ -131,7 +132,7 @@ void fused_conv_backward_cuda_minkowski(torch::Tensor features, torch::Tensor di ...@@ -131,7 +132,7 @@ void fused_conv_backward_cuda_minkowski(torch::Tensor features, torch::Tensor di
tv::dispatch_torch<float>(dtype, [&](auto I) { tv::dispatch_torch<float>(dtype, [&](auto I) {
using T = decltype(I); using T = decltype(I);
tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue){ tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue) {
constexpr int ShmemSize = decltype(ShSizeValue)::value; constexpr int ShmemSize = decltype(ShSizeValue)::value;
for (int s = 0; s < num_div; s++) { for (int s = 0; s < num_div; s++) {
int remainder = nHot - step * s; int remainder = nHot - step * s;
...@@ -141,10 +142,10 @@ void fused_conv_backward_cuda_minkowski(torch::Tensor features, torch::Tensor di ...@@ -141,10 +142,10 @@ void fused_conv_backward_cuda_minkowski(torch::Tensor features, torch::Tensor di
matmul2<T, int32_t, ShmemSize><<<grid, threads, 0, stream>>>( matmul2<T, int32_t, ShmemSize><<<grid, threads, 0, stream>>>(
dout.data_ptr<T>(), out_nchannel, curr_num_active, // A dout.data_ptr<T>(), out_nchannel, curr_num_active, // A
filters.data_ptr<T>(), out_nchannel, filters.data_ptr<T>(), out_nchannel,
in_nchannel, // B in_nchannel, // B
features.data_ptr<T>(), in_nchannel, curr_num_active, // D features.data_ptr<T>(), in_nchannel, curr_num_active, // D
din.data_ptr<T>(), // C din.data_ptr<T>(), // C
dfilters.data_ptr<T>(), // E dfilters.data_ptr<T>(), // E
indicesIn.data_ptr<int32_t>(), indicesOut.data_ptr<int32_t>()); indicesIn.data_ptr<int32_t>(), indicesOut.data_ptr<int32_t>());
} }
}); });
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment