"csrc/sm90/vscode:/vscode.git/clone" did not exist on "41b611f7d7561790a2f5040ff89212e08c7b0011"
Commit 6c767a51 authored by Yan Yan's avatar Yan Yan
Browse files

working on remove functor

parent 19e73bbe
...@@ -24,7 +24,6 @@ ...@@ -24,7 +24,6 @@
namespace spconv { namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32. // torch.jit's doc says only support int64, so we need to convert to int32.
template <typename T>
torch::Tensor torch::Tensor
fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters, fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
torch::Tensor bias, torch::Tensor indicePairs, torch::Tensor bias, torch::Tensor indicePairs,
...@@ -80,31 +79,17 @@ fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters, ...@@ -80,31 +79,17 @@ fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
continue; continue;
} }
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(), auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
{nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(), auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr(),
{nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor; sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (device == torch::kCUDA) { else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
} }
#endif #endif
else { else {
...@@ -116,20 +101,11 @@ fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters, ...@@ -116,20 +101,11 @@ fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
// totalGEMMTime += timer.report() / 1000.0; // totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse], nHot);
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (device == torch::kCUDA) { else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse], nHot);
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
} }
#endif #endif
else { else {
......
...@@ -97,6 +97,21 @@ int create_submconv_indice_pair_cuda( ...@@ -97,6 +97,21 @@ int create_submconv_indice_pair_cuda(
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape, std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash); bool transpose, bool resetGrid, bool useHash);
int create_conv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
bool useHash);
int create_submconv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash);
} // namespace spconv } // namespace spconv
#endif #endif
\ No newline at end of file
...@@ -15,24 +15,19 @@ ...@@ -15,24 +15,19 @@
#ifndef SPARSE_REORDERING_FUNCTOR_H_ #ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_ #define SPARSE_REORDERING_FUNCTOR_H_
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv { namespace spconv {
namespace functor { void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
template <typename Device, typename T, typename Index> torch::Tensor indices, int size);
struct SparseGatherFunctor { void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
void operator()(const Device &d, tv::TensorView<T> buffer, torch::Tensor indices, int size);
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size); void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
}; torch::Tensor indices, int size);
void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size);
template <typename Device, typename T, typename Index>
struct SparseScatterAddFunctor {
void operator()(const Device &d, tv::TensorView<T> out_features,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size,
bool stable = false);
};
} // namespace functor
} // namespace spconv } // namespace spconv
#endif #endif
\ No newline at end of file
...@@ -198,6 +198,15 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -198,6 +198,15 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
} }
} }
std::vector<torch::Tensor>
getIndicePairV2(torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape,
std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM,
int64_t _transpose, int64_t _useHash);
template <unsigned NDim> template <unsigned NDim>
std::vector<torch::Tensor> getIndicePairPreGrid( std::vector<torch::Tensor> getIndicePairPreGrid(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize, torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
...@@ -333,7 +342,6 @@ std::vector<torch::Tensor> getIndicePairPreGrid( ...@@ -333,7 +342,6 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM); int64_t numActOut, int64_t _inverse, int64_t _subM);
std::vector<torch::Tensor> std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters, indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs, torch::Tensor outGrad, torch::Tensor indicePairs,
......
...@@ -52,6 +52,10 @@ enum DType { ...@@ -52,6 +52,10 @@ enum DType {
namespace detail { namespace detail {
using dtype_collection_t =
tv::mp_list_c<int, float32, int32, int16, int8, float64, bool_, uint8,
float16, int64, uint16, uint32, uint64>;
using all_tensor_types_t = using all_tensor_types_t =
std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t, std::tuple<float, double, int8_t, int16_t, int32_t, int64_t, uint8_t,
uint16_t, uint32_t, uint64_t, bool>; uint16_t, uint32_t, uint64_t, bool>;
...@@ -305,7 +309,7 @@ template <class... Ts, typename F> void dispatch(DType t, F &&f) { ...@@ -305,7 +309,7 @@ template <class... Ts, typename F> void dispatch(DType t, F &&f) {
static_assert(sizeof...(Ts) > 0, "you need to provide at least one type"); static_assert(sizeof...(Ts) > 0, "you need to provide at least one type");
bool notFound = true; bool notFound = true;
mp_for_each<mp_list<Ts...>>([=, &notFound, &f](auto I) { mp_for_each<mp_list<Ts...>>([=, &notFound, &f](auto I) {
if (type_v<decltype(I)> == t) { if (type_v<decltype(I)> == t && notFound) {
std::forward<F>(f)(decltype(I)()); std::forward<F>(f)(decltype(I)());
notFound = false; notFound = false;
} }
...@@ -325,7 +329,7 @@ template <typename T, T... Is, typename F> void dispatch_scalar(T idx, F &&f) { ...@@ -325,7 +329,7 @@ template <typename T, T... Is, typename F> void dispatch_scalar(T idx, F &&f) {
"you need to provide at least one candidate"); "you need to provide at least one candidate");
bool notFound = true; bool notFound = true;
mp_for_each<mp_list_c<T, Is...>>([=, &notFound, &f](auto I) { mp_for_each<mp_list_c<T, Is...>>([=, &notFound, &f](auto I) {
if (T(I) == idx) { if (T(I) == idx && notFound) {
std::forward<F>(f)(I); std::forward<F>(f)(I);
notFound = false; notFound = false;
} }
...@@ -343,7 +347,27 @@ template <int... Is, typename F> void dispatch_int(int idx, F &&f) { ...@@ -343,7 +347,27 @@ template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
"you need to provide at least one candidate"); "you need to provide at least one candidate");
bool notFound = true; bool notFound = true;
mp_for_each<mp_list_c<int, Is...>>([=, &notFound, &f](auto I) { mp_for_each<mp_list_c<int, Is...>>([=, &notFound, &f](auto I) {
if (int(I) == idx) { if (decltype(I)::value == idx && notFound) {
std::forward<F>(f)(I);
notFound = false;
}
});
if (notFound) {
std::stringstream ss;
mp_for_each<mp_list_c<int, Is...>>(
[=, &ss](auto I) { ss << decltype(I)::value << " "; });
TV_THROW_RT_ERR("unknown value", idx, ", available:", ss.str());
}
}
template <int... Is, typename F, class BinaryPredicate>
void dispatch_int(int idx, BinaryPredicate p, F &&f) {
// BinaryPredicate: BinaryPredicate(idx, candidate)
static_assert(sizeof...(Is) > 0,
"you need to provide at least one candidate");
bool notFound = true;
mp_for_each<mp_list_c<int, Is...>>([=, &notFound, &f](auto I) {
if (p(idx, decltype(I)::value) && notFound) {
std::forward<F>(f)(I); std::forward<F>(f)(I);
notFound = false; notFound = false;
} }
...@@ -351,7 +375,7 @@ template <int... Is, typename F> void dispatch_int(int idx, F &&f) { ...@@ -351,7 +375,7 @@ template <int... Is, typename F> void dispatch_int(int idx, F &&f) {
if (notFound) { if (notFound) {
std::stringstream ss; std::stringstream ss;
mp_for_each<mp_list_c<int, Is...>>( mp_for_each<mp_list_c<int, Is...>>(
[=, &ss](auto I) { ss << int(I) << " "; }); [=, &ss](auto I) { ss << decltype(I)::value << " "; });
TV_THROW_RT_ERR("unknown value", idx, ", available:", ss.str()); TV_THROW_RT_ERR("unknown value", idx, ", available:", ss.str());
} }
} }
...@@ -373,12 +397,16 @@ struct Dispatch<T<Args...>> { ...@@ -373,12 +397,16 @@ struct Dispatch<T<Args...>> {
template <class T> struct DispatchInt; template <class T> struct DispatchInt;
template <template <int...> class T, int... Ints> template <template<class...> class Tin, template<class, int> class T, int... Ints>
struct DispatchInt<T<Ints...>> { struct DispatchInt<Tin<T<int, Ints>...>> {
template <typename F> inline void operator()(int t, F &&f) { template <typename F> inline void operator()(int t, F &&f) {
return dispatch_int<Ints...>(t, std::forward<F>(f)); return dispatch_int<Ints...>(t, std::forward<F>(f));
} }
template <typename F, typename BinaryPredicate> inline void operator()(int t, BinaryPredicate p, F &&f) {
return dispatch_int<Ints...>(t, p, std::forward<F>(f));
}
}; };
constexpr size_t kTensorMaxDim = 10; constexpr size_t kTensorMaxDim = 10;
using TensorShape = ShapeBase<kTensorMaxDim, int64_t>; using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;
......
...@@ -81,16 +81,7 @@ def get_indice_pairs(indices, ...@@ -81,16 +81,7 @@ def get_indice_pairs(indices,
else: else:
out_shape = spatial_shape out_shape = spatial_shape
if grid is None: if grid is None:
if ndim == 2: res = torch.ops.spconv.get_indice_pairs_v2(indices, batch_size, out_shape,
get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_2d
elif ndim == 3:
get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_3d
elif ndim == 4:
get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_4d
else:
raise NotImplementedError
res = get_indice_pairs_func(indices, batch_size, out_shape,
spatial_shape, ksize, stride, padding, spatial_shape, ksize, stride, padding,
dilation, out_padding, int(subm), dilation, out_padding, int(subm),
int(transpose), int(use_hash)) int(transpose), int(use_hash))
...@@ -115,7 +106,7 @@ def indice_conv(features, ...@@ -115,7 +106,7 @@ def indice_conv(features,
num_activate_out, num_activate_out,
inverse=False, inverse=False,
subm=False): subm=False):
return torch.ops.spconv.indice_conv(features, filters, indice_pairs, return torch.ops.spconv.indice_conv_v2(features, filters, indice_pairs,
indice_pair_num, num_activate_out, indice_pair_num, num_activate_out,
int(inverse), int(subm)) int(inverse), int(subm))
......
...@@ -12,28 +12,27 @@ ...@@ -12,28 +12,27 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <spconv/fused_spconv_ops.h>
#include <spconv/nms_ops.h> #include <spconv/nms_ops.h>
#include <spconv/pillar_scatter_ops.h> #include <spconv/pillar_scatter_ops.h>
#include <spconv/pool_ops.h> #include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h> #include <spconv/spconv_ops.h>
#include <torch/script.h> #include <torch/script.h>
#include <spconv/fused_spconv_ops.h>
static auto registry = static auto registry =
torch::RegisterOperators() torch::RegisterOperators()
.op("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>) .op("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
.op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>) .op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
.op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>) .op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
.op("spconv::get_indice_pairs_v2", &spconv::getIndicePairV2)
.op("spconv::get_indice_pairs_grid_2d", .op("spconv::get_indice_pairs_grid_2d",
&spconv::getIndicePairPreGrid<2>) &spconv::getIndicePairPreGrid<2>)
.op("spconv::get_indice_pairs_grid_3d", .op("spconv::get_indice_pairs_grid_3d",
&spconv::getIndicePairPreGrid<3>) &spconv::getIndicePairPreGrid<3>)
.op("spconv::indice_conv", &spconv::indiceConv) .op("spconv::indice_conv", &spconv::indiceConv)
.op("spconv::indice_conv_backward", &spconv::indiceConvBackward) .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
.op("spconv::fused_indice_conv_fp32", .op("spconv::fused_indice_conv_bn",
&spconv::fusedIndiceConvBatchNorm<float>) &spconv::fusedIndiceConvBatchNorm)
.op("spconv::fused_indice_conv_half",
&spconv::fusedIndiceConvBatchNorm<at::Half>)
.op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>) .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
.op("spconv::indice_maxpool_backward_fp32", .op("spconv::indice_maxpool_backward_fp32",
&spconv::indiceMaxPoolBackward<float>) &spconv::indiceMaxPoolBackward<float>)
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <spconv/geometry.h> #include <spconv/geometry.h>
#include <spconv/indice.h> #include <spconv/indice.h>
#include <spconv/spconv_ops.h> #include <spconv/spconv_ops.h>
#include <tensorview/tensor.h>
#include <torch/script.h> #include <torch/script.h>
namespace spconv { namespace spconv {
...@@ -253,6 +254,79 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, ...@@ -253,6 +254,79 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
} }
#endif #endif
int create_conv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
bool useHash) {
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
auto kernelVolume = indicePairs.size(0);
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
using Index = decltype(V);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
if (transpose)
numActIn = getIndicePairsDeConv<Index, IndexGrid, NDim>(
tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
di.data(), ou.data());
else
numActIn = getIndicePairsConv<Index, IndexGrid, NDim>(
tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
di.data(), ou.data());
});
});
return numActIn;
}
int create_submconv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
auto kernelVolume = indicePairs.size(0);
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
using Index = decltype(V);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
numActIn = getIndicePairsSubM<Index, IndexGrid, NDim>(
tv::torch2tv<Index>(indicesIn), tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indiceNum),
ks.data(), st.data(), pa.data(), di.data(), ou.data());
});
});
return numActIn;
}
namespace functor { namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> { struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
......
...@@ -38,39 +38,45 @@ int create_conv_indice_pair_p1_cuda( ...@@ -38,39 +38,45 @@ int create_conv_indice_pair_p1_cuda(
auto stream = at::cuda::getCurrentCUDAStream(); auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = kernelSize.size(); auto ndim = kernelSize.size();
auto numActIn = indicesIn.size(0); auto numActIn = indicesIn.size(0);
auto kernelVolume = indicePairs.size(0);
if (numActIn == 0) if (numActIn == 0)
return 0; return 0;
// dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8. tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) { using Index = decltype(IndexValue);
using Index = decltype(V);
using IndexGrid = int32_t; using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) { tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = I; constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end()); tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end()); tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end()); tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end()); tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(), tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end()); outSpatialShape.end());
tv::dispatch_int<16, 32, 256, 4096>(
kernelVolume, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = decltype(I2)::value;
if (transpose) { if (transpose) {
prepareDeConvIndicePairsKernel<Index, NDim, 4096> prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
stream>>>(tv::torch2tv<Index>(indicesIn), 0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), tv::torch2tv<Index>(indiceNum),
tv::torch2tv<Index>(indicePairUnique), ks, st, pa, di, tv::torch2tv<Index>(indicePairUnique), ks, st,
ou); pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
} else { } else {
prepareIndicePairsKernel<Index, NDim, 4096> prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
stream>>>(tv::torch2tv<Index>(indicesIn), 0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), tv::torch2tv<Index>(indiceNum),
tv::torch2tv<Index>(indicePairUnique), ks, st, pa, di, tv::torch2tv<Index>(indicePairUnique), ks, st,
ou); pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
} }
}); });
}); });
});
return 1; return 1;
} }
...@@ -88,12 +94,11 @@ int create_conv_indice_pair_p2_cuda( ...@@ -88,12 +94,11 @@ int create_conv_indice_pair_p2_cuda(
auto kernelVolume = indicePairs.size(0); auto kernelVolume = indicePairs.size(0);
if (numActIn == 0) if (numActIn == 0)
return 0; return 0;
// dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8. tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) { using Index = decltype(IndexValue);
using Index = decltype(V);
using IndexGrid = int32_t; using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) { tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = I; constexpr int NDim = decltype(I)::value;
using IndexGrid = int32_t; using IndexGrid = int32_t;
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(), tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end()); outSpatialShape.end());
...@@ -122,6 +127,8 @@ int create_conv_indice_pair_p2_cuda( ...@@ -122,6 +127,8 @@ int create_conv_indice_pair_p2_cuda(
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut), numAct, stream>>>(tv::torch2tv<Index>(indicesOut), numAct,
tv::torch2tv<Index>(indicePairUnique), ou, batchSize); tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
TV_CHECK_CUDA_ERR_V2("assignIndiceOutKernel failed");
auto tableSize = table.get_table_size(); auto tableSize = table.get_table_size();
auto tableData = table.data(); auto tableData = table.data();
auto constants = table.get_constants_4(); auto constants = table.get_constants_4();
...@@ -133,6 +140,7 @@ int create_conv_indice_pair_p2_cuda( ...@@ -133,6 +140,7 @@ int create_conv_indice_pair_p2_cuda(
tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), tableSize, tv::torch2tv<Index>(indicePairUnique), tableSize,
tableData, constants, stash_constants, stash_count); tableData, constants, stash_constants, stash_count);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsHashKernel failed");
} else { } else {
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim> assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
...@@ -145,7 +153,7 @@ int create_conv_indice_pair_p2_cuda( ...@@ -145,7 +153,7 @@ int create_conv_indice_pair_p2_cuda(
assignIndicePairsKernel<Index, IndexGrid, NDim> assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut), stream>>>(tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), numAct, tv::torch2tv<IndexGrid>(gridsOut), numActIn,
tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), ou); tv::torch2tv<Index>(indicePairUnique), ou);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed"); TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
...@@ -177,11 +185,11 @@ int create_submconv_indice_pair_cuda( ...@@ -177,11 +185,11 @@ int create_submconv_indice_pair_cuda(
auto kernelVolume = indicePairs.size(0); auto kernelVolume = indicePairs.size(0);
if (numActIn == 0) if (numActIn == 0)
return 0; return 0;
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto V) { tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
using Index = decltype(V); using Index = decltype(IndexValue);
using IndexGrid = int32_t; using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) { tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = I; constexpr int NDim = decltype(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end()); tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end()); tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end()); tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
...@@ -214,26 +222,36 @@ int create_submconv_indice_pair_cuda( ...@@ -214,26 +222,36 @@ int create_submconv_indice_pair_cuda(
auto constants = table.get_constants_4(); auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants(); auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count(); auto stash_count = table.get_stash_count();
getSubMIndicePairsHashKernel<Index, NDim, 4096> tv::dispatch_int<16, 32, 256, 4096>(
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, kernelVolume, std::less_equal<int>(), [&](auto I2) {
stream>>>(tv::torch2tv<Index>(indicesIn), constexpr int MaxKernelVolume = decltype(I2)::value;
getSubMIndicePairsHashKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou, tv::torch2tv<Index>(indiceNum), ks, st, pa,
tableSize, tableData, constants, stash_constants, di, ou, tableSize, tableData, constants,
stash_count); stash_constants, stash_count);
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
});
} else { } else {
prepareSubMGridKernel<Index, IndexGrid, NDim> prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn), stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut), ou); tv::torch2tv<IndexGrid>(gridsOut), ou);
TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed"); TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096> tv::dispatch_int<16, 32, 256, 4096>(
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, ndim, std::less_equal<int>(), [&](auto I2) {
stream>>>(tv::torch2tv<Index>(indicesIn), constexpr int MaxKernelVolume = decltype(I2)::value;
getSubMIndicePairsKernel<Index, IndexGrid, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks, st, pa, di, ou); tv::torch2tv<Index>(indiceNum), ks, st, pa,
di, ou);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed"); TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
});
} }
if (resetGrid && (!useHash)) { if (resetGrid && (!useHash)) {
......
...@@ -14,59 +14,60 @@ ...@@ -14,59 +14,60 @@
#include <ATen/Parallel.h> #include <ATen/Parallel.h>
#include <spconv/reordering.h> #include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h> #include <torch/script.h>
namespace spconv { namespace spconv {
namespace functor { using float_types_t = tv::mp_list<float, double, at::Half>;
template <typename T, typename Index> using int_types_t = tv::mp_list<int32_t, int64_t>;
struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU &d, tv::TensorView<T> buffer, void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
tv::TensorView<const T> features, torch::Tensor indices, int size) {
tv::TensorView<const Index> indices, int size) { int numPlanes = features.size(1);
int numPlanes = features.dim(1); auto dtype = features.scalar_type();
auto int_dtype = indices.scalar_type();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue);
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
Index *indices_data = indices.data_ptr<Index>();
T *buffer_data = buffer.data_ptr<T>();
const T *features_data = features.data_ptr<T>();
at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) { at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
for (int i = begin; i < end; ++i) { for (int i = begin; i < end; ++i) {
std::memcpy(buffer.data() + i * numPlanes, std::memcpy(buffer_data + i * numPlanes,
features.data() + indices[i] * numPlanes, features_data + indices_data[i] * numPlanes,
sizeof(T) * numPlanes); sizeof(T) * numPlanes);
} }
}); });
} });
}; });
}
template <typename T, typename Index> void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
struct SparseScatterAddFunctor<tv::CPU, T, Index> { torch::Tensor indices, int size) {
void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures, int numPlanes = outFeatures.size(1);
tv::TensorView<const T> buffer, auto dtype = outFeatures.scalar_type();
tv::TensorView<const Index> indices, int size, bool stable) { auto int_dtype = indices.scalar_type();
int numPlanes = outFeatures.dim(1);
const T *buf = buffer.data(); tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
T *out = outFeatures.data(); using T = decltype(TValue);
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue);
Index *indices_data = indices.data_ptr<Index>();
const T *buffer_data = buffer.data_ptr<T>();
T *features_data = outFeatures.data_ptr<T>();
const T *buf = buffer.data_ptr<T>();
T *out = outFeatures.data_ptr<T>();
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes; buf = buffer_data + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes; out = features_data + indices_data[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j) { for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j]; out[j] += buf[j];
} }
} }
} });
}; });
}
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv } // namespace spconv
...@@ -20,23 +20,31 @@ ...@@ -20,23 +20,31 @@
#include <tensorview/cuda_utils.h> #include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h> #include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h> #include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <tensorview/torch_utils.h>
#include <type_traits> #include <type_traits>
#include <utility/timer.h> #include <utility/timer.h>
namespace spconv { namespace spconv {
namespace functor {
template <typename T, typename Index> void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
struct SparseGatherFunctor<tv::GPU, T, Index> { torch::Tensor indices, int size) {
if (size <= 0)
return;
int numPlanes = features.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
tv::dispatch_torch<float, double,
at::Half>(features.scalar_type(), [&](auto TValue) {
using T = decltype(TValue);
using vecload_type_t = using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>; std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>; using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features, tv::dispatch_torch<int32_t, int64_t>(
tv::TensorView<const Index> indices, int size) { indices.scalar_type(), [&](auto IndexValue) {
if (size <= 0) using Index = decltype(IndexValue);
return;
int numPlanes = features.dim(1);
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices, tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
...@@ -47,11 +55,12 @@ struct SparseGatherFunctor<tv::GPU, T, Index> { ...@@ -47,11 +55,12 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
if (notFound) { if (notFound) {
if (numPlanes % NumTLP == 0) { if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) { if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t> gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP), <<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(), stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data(), nHotBlock, indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
...@@ -60,10 +69,10 @@ struct SparseGatherFunctor<tv::GPU, T, Index> { ...@@ -60,10 +69,10 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t> gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(1, numPlanes / NumTLP), <<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0, dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes, stream>>>(buffer.data_ptr<T>() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock, features.data_ptr<T>(),
size - nHotBlock, indices.data_ptr<Index>() + nHotBlock,
numPlanes / vecloadFactor); size - nHotBlock, numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
notFound = false; notFound = false;
...@@ -77,49 +86,63 @@ struct SparseGatherFunctor<tv::GPU, T, Index> { ...@@ -77,49 +86,63 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
gatherGenericKernel<T, Index, NumTLP, NumILP> gatherGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP), <<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)), tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>( dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes); buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), size, numPlanes);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
} });
}; });
template <typename T, typename Index> }
struct SparseScatterAddFunctor<tv::GPU, T, Index> {
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size) {
if (size <= 0)
return;
int numPlanes = outFeatures.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
tv::dispatch_torch<float, double, at::Half>(
outFeatures.scalar_type(), [&](auto TValue) {
using T = decltype(TValue);
using vecload_type_t = using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>; std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>; using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer, tv::dispatch_torch<int32_t, int64_t>(
tv::TensorView<const Index> indices, int size, bool stable) { indices.scalar_type(), [&](auto IndexValue) {
if (size <= 0) using Index = decltype(IndexValue);
return;
int numPlanes = outFeatures.dim(1);
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half. sizeof(vecload_type_t) / sizeof(T); // important for half.
tv::mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices, tv::mp_for_each<kernel_block_t>(
&notFound](auto NumTLP) { [=, &outFeatures, &buffer, &indices, &notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor)); // constexpr int NumILP = NumTLP / (64 / (NumTLP /
// vecloadFactor));
constexpr int NumILP = NumTLP / 4; constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP; int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) { if (notFound) {
if (numPlanes % NumTLP == 0) { if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) { if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP, scatterAddVecBlockKernel<T, Index, int(NumTLP),
vecload_type_t> NumILP, vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP), <<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP),
d.getStream()>>>(outFeatures.data(), buffer.data(), 0, stream>>>(outFeatures.data_ptr<T>(),
indices.data(), nHotBlock, buffer.data_ptr<T>(),
indices.data_ptr<Index>(),
nHotBlock,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
if (size - nHotBlock > 0) { if (size - nHotBlock > 0) {
scatterAddGenericKernel<T, Index, int(NumTLP), NumILP> scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP), <<<dim3(1, numPlanes / NumTLP),
0, d.getStream()>>>( dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data(), buffer.data() + nHotBlock * numPlanes, outFeatures.data_ptr<T>(),
indices.data() + nHotBlock, size - nHotBlock, numPlanes); buffer.data_ptr<T>() + nHotBlock * numPlanes,
indices.data_ptr<Index>() + nHotBlock,
size - nHotBlock, numPlanes);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
notFound = false; notFound = false;
...@@ -132,25 +155,13 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> { ...@@ -132,25 +155,13 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
scatterAddGenericKernel<T, Index, NumTLP, NumILP> scatterAddGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(tv::cuda::DivUp(size, NumTLP), <<<dim3(tv::cuda::DivUp(size, NumTLP),
tv::cuda::DivUp(numPlanes, NumTLP)), tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>( dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data(), buffer.data(), indices.data(), size, outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
numPlanes); indices.data_ptr<Index>(), size, numPlanes);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
} });
}; });
} // namespace functor }
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv } // namespace spconv
\ No newline at end of file
#include <spconv/spconv_ops.h> #include <spconv/spconv_ops.h>
namespace spconv { namespace spconv {
std::vector<torch::Tensor>
getIndicePairV2(torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape,
std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM,
int64_t _transpose, int64_t _useHash) {
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto NDim = kernelSize.size();
// CPU always use hash (tsl::robin_map).
bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
std::string msg = "due to limits of cuda hash, the volume of dense space "
"include batch size ";
msg += "must less than std::numeric_limits<int>::max() = 2e9";
TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(),
msg);
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
auto gridSize = batchSize * outputVolume;
if (useHash) {
gridSize = batchSize;
}
torch::Tensor gridOut = torch::full(
{gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
gridOut = gridOut.view({batchSize, -1});
int64_t numActOut = -1;
for (int i = 0; i < NDim; ++i) {
if (subM) {
padding[i] = kernelSize[i] / 2;
stride[i] = 1;
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
numActOut = create_submconv_indice_pair_cpu(
indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
dilation, outSpatialShape, transpose, false, useHash);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
numActOut = create_submconv_indice_pair_cuda(
indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
dilation, outSpatialShape, transpose, false, useHash);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
return {indices, indicePairs, indiceNum};
} else {
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
numActOut = create_conv_indice_pair_cpu(
indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,
padding, dilation, outSpatialShape, transpose, false, useHash);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
numActOut = create_conv_indice_pair_p1_cuda(
indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = create_conv_indice_pair_p2_cuda(
indices, outInds, gridOut, indicePairs, indiceNum, indicePairUnique,
outSpatialShape, transpose, false, useHash);
}
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) { int64_t numActOut, int64_t _inverse, int64_t _subM) {
...@@ -47,9 +153,6 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -47,9 +153,6 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double totalGatherTime = 0; double totalGatherTime = 0;
double totalGEMMTime = 0; double totalGEMMTime = 0;
double totalSAddTime = 0; double totalSAddTime = 0;
tv::dispatch_torch<float, double, at::Half>(
features.scalar_type(), [&](auto I) {
using T = decltype(I);
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
...@@ -57,25 +160,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -57,25 +160,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
} }
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob( auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options); outputBuffer.data_ptr(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(), auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr(),
{nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor; sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (device == torch::kCUDA) { else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion /* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64); auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(), auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
...@@ -92,22 +186,11 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -92,22 +186,11 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
// totalGEMMTime += timer.report() / 1000.0; // totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse], nHot);
scatterFtor(
tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (device == torch::kCUDA) { else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse], nHot);
scatterFtor(
tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
} }
#endif #endif
else { else {
...@@ -115,13 +198,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -115,13 +198,14 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
} }
// totalSAddTime += timer.report() / 1000.0; // totalSAddTime += timer.report() / 1000.0;
} }
});
// std::cout << "gather time " << totalGatherTime << std::endl; // std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl; // std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl; // std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output; return output;
} }
std::vector<torch::Tensor> std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters, indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs, torch::Tensor outGrad, torch::Tensor indicePairs,
...@@ -158,40 +242,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -158,40 +242,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::mm_out(filterGradSub, features.t(), outGrad); torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t()); torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
} }
tv::dispatch_torch<float, double,
at::Half>(features.scalar_type(), [&](auto I) {
using T = decltype(I);
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i]; auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor; sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut; sparse_gather_cpu(outputBuffer, outGrad, indicePairs[i][!inverse], nHot);
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (device == torch::kCUDA) { else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut; sparse_gather_cuda(outputBuffer, outGrad, indicePairs[i][!inverse], nHot);
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR();
} }
#endif #endif
else { else {
...@@ -199,36 +262,27 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -199,36 +262,27 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
} }
auto filterGradSub = filtersGrad[i]; auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(), auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
{nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(), auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr(),
{nHot, numInPlanes}, options); {nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob); torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t()); torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
if (device == torch::kCPU) { if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; sparse_scatter_add_cpu(inputBuffer, inputGrad, indicePairs[i][inverse], nHot);
scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (device == torch::kCUDA) { else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; sparse_scatter_add_cuda(inputBuffer, inputGrad, indicePairs[i][inverse], nHot);
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
} }
#endif #endif
else { else {
TV_ASSERT_INVALID_ARG(false, "unknown device type"); TV_ASSERT_INVALID_ARG(false, "unknown device type");
} }
} }
});
return {inputGrad, filtersGrad.view(filterShape)}; return {inputGrad, filtersGrad.view(filterShape)};
} }
} // namespace spconv } // namespace spconv
\ No newline at end of file
...@@ -392,7 +392,7 @@ class TestSpConv(TestCase): ...@@ -392,7 +392,7 @@ class TestSpConv(TestCase):
def testSpDeConv3d(self): def testSpDeConv3d(self):
np.random.seed(484) np.random.seed(484)
devices = ["cuda:0", "cpu:0"] devices = ["cuda:0"]
shapes = [[19, 18, 17]] shapes = [[19, 18, 17]]
batchsizes = [1, 2] batchsizes = [1, 2]
...@@ -598,9 +598,9 @@ def main(): ...@@ -598,9 +598,9 @@ def main():
shapes = [[50, 30, 30]] shapes = [[50, 30, 30]]
batchsizes = [2] batchsizes = [2]
in_channels = [256] in_channels = [32]
out_channels = [256] out_channels = [64]
ksizes = [(3, 1, 1)] ksizes = [(3, 3, 3)]
strides = [1] strides = [1]
paddings = [0] paddings = [0]
dilations = [1] dilations = [1]
...@@ -654,5 +654,6 @@ def main(): ...@@ -654,5 +654,6 @@ def main():
if __name__ == '__main__': if __name__ == '__main__':
main() # main()
# unittest.main() # unittest.main()
TestSpConv().testSpDeConv3d()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment