Commit 9ce18407 authored by yanyan's avatar yanyan
Browse files

add SparseConvNet algorithm

parent 47bea04c
......@@ -37,8 +37,8 @@ observers:
libpaths: [
/home/yy/anaconda3/lib,
]
libraries: [-lnvinfer, -lpython3.7m, -lcublas, -lcudart, -ljpeg]
std: c++2a
libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg]
std: c++14
options: [-Wall, -Wextra]
cudadev:
......@@ -65,11 +65,11 @@ observers:
std: c++14
options: [
-Wno-deprecated-declarations,
"-gencode=arch=compute_52,code=sm_52",
# "-gencode=arch=compute_52,code=sm_52",
"-gencode=arch=compute_61,code=sm_61",
"-gencode=arch=compute_60,code=sm_60",
"-gencode=arch=compute_70,code=sm_70",
"-gencode=arch=compute_75,code=sm_75",
# "-gencode=arch=compute_60,code=sm_60",
# "-gencode=arch=compute_70,code=sm_70",
# "-gencode=arch=compute_75,code=sm_75",
]
torchdev:
......
This diff is collapsed.
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda_runtime_api.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
torch::Tensor dout, torch::Tensor filters,
torch::Tensor dfilters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
} // namespace spconv
......@@ -20,6 +20,18 @@
#include <tensorview/tensorview.h>
namespace spconv {
template <bool UseDeconv, typename Index, unsigned NDim> struct ConvIndiceDispatch;
template <typename Index, unsigned NDim>
struct ConvIndiceDispatch<true, Index, NDim>{
constexpr static auto* func = getValidOutPosTranspose<Index, NDim>;
};
template <typename Index, unsigned NDim>
struct ConvIndiceDispatch<false, Index, NDim>{
constexpr static auto* func = getValidOutPos<Index, NDim>;
};
template <typename Index, unsigned NDim, bool UseDeconv,
int KernelMaxVolume = 256, typename Index1D = int>
__global__ void prepareIndicePairsKernel(
......@@ -47,19 +59,10 @@ __global__ void prepareIndicePairsKernel(
auto indicePairsDim2 = indicePairs.dim(2);
Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
if (UseDeconv) {
// nvcc will optimize this fake "if constexpr"
// after cuda 11 released, we will start to use real if constexpr.
numValidPoints = getValidOutPosTranspose<Index, NDim>(
numValidPoints = ConvIndiceDispatch<UseDeconv, Index, NDim>::func(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(),
outSpatialShape.data(), validPoints);
} else {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(),
outSpatialShape.data(), validPoints);
}
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
......
......@@ -23,7 +23,9 @@
namespace spconv {
enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
enum ConvAlgo { kNative = 0, kBatch, kBatchGemmGather, kSparseConvNet };
using all_conv_algos_t =
tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather, kSparseConvNet>;
// torch.jit's doc says only support int64, so we need to convert to int32.
std::vector<torch::Tensor>
......
#pragma once
#include <cutlass/gemm/device/gemm.h>
#include <type_traits>
namespace spconv {
......
#pragma once
#include <cuda_runtime_api.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
......
/*
From PyTorch:
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
From Caffe2:
Copyright (c) 2016-present, Facebook Inc. All rights reserved.
All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.
All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.
All contributions by Yangqing Jia:
Copyright (c) 2015 Yangqing Jia
All rights reserved.
All contributions from Caffe:
Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved.
All other contributions:
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.
Caffe2 uses a copyright model similar to Caffe: each contributor holds
copyright over their contributions to Caffe2. The project versioning records
all such contribution and copyright details. If a contributor wants to further
mark their specific copyright on a particular contribution, they should
indicate their copyright solely in the commit message of the change when it is
committed.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <type_traits>
#include <utility>
namespace tv {
#ifdef __cpp_lib_void_t
template <class T> using void_t = std::void_t<T>;
#else
// Implementation taken from http://en.cppreference.com/w/cpp/types/void_t
// (it takes CWG1558 into account and also works for older compilers)
template <typename... Ts> struct make_void { typedef void type; };
template <typename... Ts> using void_t = typename make_void<Ts...>::type;
#endif
namespace detail {
struct _identity final {
template <class T> using type_identity = T;
template <class T> decltype(auto) operator()(T &&arg) {
return std::forward<T>(arg);
}
};
template<class Func, class Enable = void>
struct function_takes_identity_argument : std::false_type {};
#if defined(_MSC_VER)
// For some weird reason, MSVC shows a compiler error when using guts::void_t instead of std::void_t.
// But we're only building on MSVC versions that have std::void_t, so let's just use that one.
template<class Func>
struct function_takes_identity_argument<Func, std::void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {};
#else
template<class Func>
struct function_takes_identity_argument<Func, void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {};
#endif
template<bool Condition>
struct _if_constexpr;
template<>
struct _if_constexpr<true> final {
template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ThenCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) {
// The _identity instance passed in can be used to delay evaluation of an expression,
// because the compiler can't know that it's just the identity we're passing in.
return thenCallback(_identity());
}
template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) {
return thenCallback();
}
};
template<>
struct _if_constexpr<false> final {
template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ElseCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) {
// The _identity instance passed in can be used to delay evaluation of an expression,
// because the compiler can't know that it's just the identity we're passing in.
return elseCallback(_identity());
}
template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) {
return elseCallback();
}
};
} // namespace detail
/*
* Get something like C++17 if constexpr in C++14.
*
* Example 1: simple constexpr if/then/else
* template<int arg> int increment_absolute_value() {
* int result = arg;
* if_constexpr<(arg > 0)>(
* [&] { ++result; } // then-case
* [&] { --result; } // else-case
* );
* return result;
* }
*
* Example 2: without else case (i.e. conditionally prune code from assembly)
* template<int arg> int decrement_if_positive() {
* int result = arg;
* if_constexpr<(arg > 0)>(
* // This decrement operation is only present in the assembly for
* // template instances with arg > 0.
* [&] { --result; }
* );
* return result;
* }
*
* Example 3: branch based on type (i.e. replacement for SFINAE)
* struct MyClass1 {int value;};
* struct MyClass2 {int val};
* template <class T>
* int func(T t) {
* return if_constexpr<std::is_same<T, MyClass1>::value>(
* [&](auto _) { return _(t).value; }, // this code is invalid for T == MyClass2, so a regular non-constexpr if statement wouldn't compile
* [&](auto _) { return _(t).val; } // this code is invalid for T == MyClass1
* );
* }
*
* Note: The _ argument passed in Example 3 is the identity function, i.e. it does nothing.
* It is used to force the compiler to delay type checking, because the compiler
* doesn't know what kind of _ is passed in. Without it, the compiler would fail
* when you try to access t.value but the member doesn't exist.
*
* Note: In Example 3, both branches return int, so func() returns int. This is not necessary.
* If func() had a return type of "auto", then both branches could return different
* types, say func<MyClass1>() could return int and func<MyClass2>() could return string.
*/
template<bool Condition, class ThenCallback, class ElseCallback>
decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCallback) {
#if defined(__cpp_if_constexpr)
// If we have C++17, just use it's "if constexpr" feature instead of wrapping it.
// This will give us better error messages.
if constexpr(Condition) {
if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) {
return std::forward<ThenCallback>(thenCallback)(detail::_identity());
} else {
return std::forward<ThenCallback>(thenCallback)();
}
} else {
if constexpr (detail::function_takes_identity_argument<ElseCallback>::value) {
return std::forward<ElseCallback>(elseCallback)(detail::_identity());
} else {
return std::forward<ElseCallback>(elseCallback)();
}
}
#else
// C++14 implementation of if constexpr
return detail::_if_constexpr<Condition>::call(std::forward<ThenCallback>(thenCallback),
std::forward<ElseCallback>(elseCallback));
#endif
}
template<bool Condition, class ThenCallback>
decltype(auto) if_constexpr(ThenCallback&& thenCallback) {
#if defined(__cpp_if_constexpr)
// If we have C++17, just use it's "if constexpr" feature instead of wrapping it.
// This will give us better error messages.
if constexpr(Condition) {
if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) {
return std::forward<ThenCallback>(thenCallback)(detail::_identity());
} else {
return std::forward<ThenCallback>(thenCallback)();
}
}
#else
// C++14 implementation of if constexpr
return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback), [] (auto) {});
#endif
}
}
......@@ -11,7 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <sstream>
#ifdef TV_USE_STACKTRACE
......
......@@ -28,6 +28,7 @@ If you can use libtorch, dont use tv::Tensor.
#include <iomanip>
#include <memory>
#include <type_traits>
#include "cc17.h"
#ifdef TV_CUDA
#include <cuda_fp16.h>
#include <cuda_runtime.h>
......@@ -623,36 +624,17 @@ struct Tensor {
return TensorView<T, Rank, PtrTraits, Tindex>(
reinterpret_cast<T *>(data<T>()), shape, stride);
}
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int,
typename std::enable_if<Rank == -1, int>::type = 0>
TensorView<T, Rank, PtrTraits, Tindex> tview() {
writable_check();
static_assert(Rank == -1 || Rank > 0, "error");
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
for (size_t i = 0; i < ndim(); ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<T, Rank, PtrTraits, Tindex>(
reinterpret_cast<T *>(data<T>()), shape, stride);
}
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int,
typename std::enable_if<(Rank > 0), int>::type = 0>
typename Tindex = int>
TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
tview() const {
static_assert(Rank == -1 || Rank > 0, "error");
if (Rank > 0) {
TV_ASSERT_RT_ERR(Rank == ndim(), "error");
}
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
ShapeBase<Rank == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank), stride(Rank);
return if_constexpr<(Rank > 0)>([&](auto _){
TV_ASSERT_RT_ERR(Rank == ndim(), "error");
ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank), stride(Rank);
for (int i = 0; i < Rank; ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
......@@ -660,20 +642,8 @@ struct Tensor {
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
stride);
}
template <typename T, int Rank = -1,
template <class> class PtrTraits = DefaultPtrTraits,
typename Tindex = int,
typename std::enable_if<Rank == -1, int>::type = 0>
TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>
tview() const {
static_assert(Rank == -1 || Rank > 0, "error");
if (Rank > 0) {
TV_ASSERT_RT_ERR(Rank == ndim(), "error");
}
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
ShapeBase<TV_MAX_DIM, Tindex> shape(ndim()), stride(ndim());
}, [&](auto _){
ShapeBase<TV_MAX_DIM, Tindex> shape(_(ndim())), stride(ndim());
for (int i = 0; i < int(ndim()); ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
......@@ -681,8 +651,8 @@ struct Tensor {
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
stride);
});
}
template <class... Inds> Tensor view(Inds... newShapes) const {
static_assert(sizeof...(newShapes) > 0, "dont support empty for now");
TensorShape shape{int(newShapes)...};
......@@ -780,10 +750,10 @@ struct Tensor {
int dim(int idx) const {
if (idx < 0) {
TV_ASSERT_RT_ERR(shape_.size() + idx < shape_.size(), idx, shape_);
return shape_[shape_.size() + idx];
TV_ASSERT_RT_ERR(shape_.ndim() + idx < shape_.ndim(), idx, shape_);
return shape_[shape_.ndim() + idx];
} else {
TV_ASSERT_RT_ERR(idx < int(shape_.size()), idx, shape_);
TV_ASSERT_RT_ERR(idx < int(shape_.ndim()), idx, shape_);
return shape_[idx];
}
}
......
......@@ -35,6 +35,24 @@ template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
mCurTime = std::chrono::steady_clock::now();
return res;
}
template <int Count, typename F>
double benchmark(F&& f, int start=int(Count) * 0.3){
// std::vector<TimeT::rep> times;
auto res = typename TimeT::rep();
int count = 0;
cudaDeviceSynchronize();
for (int i = 0; i < Count; ++i){
std::forward<F>(f)();
auto time = report();
if (i >= start){
// times.push_back(time)
res += time;
count += 1;
}
}
return res / double(count);
}
private:
std::chrono::time_point<std::chrono::steady_clock> mCurTime;
......
......@@ -23,7 +23,7 @@ class ConvAlgo(Enum):
Native = 0 # small memory cost, faster when number of points is large.
Batch = 1 # high memory cost, faster when number of points is small (< 50000)
BatchGemmGather = 2 # high memory cost, faster when number of points medium
SparseConvNet = 3
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
ndim = len(input_size)
......
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import time
import numpy as np
import torch
from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
import spconv
from spconv.modules import SparseModule
class RemoveDuplicate(SparseModule):
def forward(self, x: spconv.SparseConvTensor):
inds = x.indices
spatial_shape = [x.batch_size, *x.spatial_shape]
spatial_stride = [0] * len(spatial_shape)
val = 1
for i in range(inds.shape[1] - 1, -1, -1):
spatial_stride[i] = val
val *= spatial_shape[i]
indices_index = inds[:, -1]
for i in range(len(spatial_shape) - 1):
indices_index += spatial_stride[i] * inds[:, i]
_, unique_inds = torch.unique(indices_index)
new_inds = inds[unique_inds]
new_features = x.features[unique_inds]
res = spconv.SparseConvTensor(new_features, new_inds, x.spatial_shape, x.batch_size, x.grid)
return res
\ No newline at end of file
set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc)
if (SPCONV_BuildCUDA)
set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc)
set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc fused_conv.cu)
endif()
add_library(spconv SHARED ${ALL_FILES})
......
#include <ATen/ATen.h>
#include <spconv/fused_conv.cu.h>
#include <spconv/fused_conv.h>
#include <tensorview/torch_utils.h>
namespace spconv {
void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot) {
auto dtype = output.scalar_type();
auto input_nPlanes = features.size(1);
auto output_nPlanes = output.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
tv::dispatch_torch<float, at::Half>(dtype, [&](auto I) {
using T = decltype(I);
dConvolution_forward2(stream, features.data_ptr<T>(), output.data_ptr<T>(),
filters.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
indicesOut.data_ptr<int32_t>(), nHot, input_nPlanes,
input_nPlanes, output_nPlanes, output_nPlanes, 1);
});
}
void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
torch::Tensor dout, torch::Tensor filters,
torch::Tensor dfilters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot) {
auto dtype = features.scalar_type();
auto input_nPlanes = features.size(1);
auto output_nPlanes = dout.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
tv::dispatch_torch<float>(dtype, [&](auto I) {
using T = decltype(I);
dConvolution_backward_dW2(
stream, features.data_ptr<T>(), din.data_ptr<T>(), dout.data_ptr<T>(),
filters.data_ptr<T>(), dfilters.data_ptr<T>(),
indicesIn.data_ptr<int32_t>(), indicesOut.data_ptr<int32_t>(), nHot,
input_nPlanes, input_nPlanes, output_nPlanes, output_nPlanes, 1);
});
}
} // namespace spconv
\ No newline at end of file
......@@ -28,7 +28,6 @@
#include <thrust/execution_policy.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
using max_kernel_vol_t = tv::mp_list_c<int, 9, 16, 27, 32, 128, 256, 4096>;
......@@ -330,7 +329,6 @@ int create_submconv_indice_pair_cuda(
auto found = false;
if (dilation_one && (NDim == 2 || NDim == 3)) {
auto indiceNumCpu = indiceNum.cpu();
if (NDim == 2) {
tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
outSpatialShape.end());
......
#include <spconv/fused_conv.h>
#include <spconv/spconv_ops.h>
#include <spgemm/gemm_th.h>
#include <tensorview/tensor.h>
namespace spconv {
......@@ -140,27 +142,11 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
}
}
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM,
int64_t algo) {
torch::Tensor indiceConvNative(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
switch (algo) {
case kBatchGemmGather:
case kBatch: {
if (kernelVolume != 1) {
return indiceConvBatch(features, filters, indicePairs, indiceNum,
numActOut, _inverse, _subM,
algo != kBatchGemmGather);
} else {
break;
}
}
case kNative:
break;
default:
TV_THROW_RT_ERR("unknown algo");
}
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
......@@ -184,9 +170,9 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::mm_out(output, features, filters[indicePairMaxOffset]);
// get indice pair second max size based on subM symmetric property
indicePairMaxSize =
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + indicePairMaxOffset);
indicePairMaxSize = *std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() +
indicePairMaxOffset);
if (indicePairMaxSize == 0) {
return output;
}
......@@ -260,12 +246,59 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
return output;
}
torch::Tensor
indiceConvSparseConvNet(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
// init for subM
int indicePairMaxOffset = kernelVolume / 2;
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("SparseConvNet only support gpu");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
fused_conv_cuda(output, features, filters[i], indicePairs[inverse][i],
indicePairs[!inverse][i], nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
}
return output;
}
template <bool BatchScatter>
torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM,
bool batchScatter) {
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
auto batchScatter = BatchScatter;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
......@@ -388,29 +421,42 @@ torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
return output;
}
template <int Algo> struct ConvDispatch;
template <> struct ConvDispatch<kNative> {
constexpr static auto *func = indiceConvNative;
};
template <> struct ConvDispatch<kBatch> {
constexpr static auto *func = indiceConvBatch<false>;
};
template <> struct ConvDispatch<kBatchGemmGather> {
constexpr static auto *func = indiceConvBatch<true>;
};
template <> struct ConvDispatch<kSparseConvNet> {
constexpr static auto *func = indiceConvSparseConvNet;
};
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM,
int64_t algo) {
torch::Tensor res;
tv::DispatchInt<all_conv_algos_t>()(algo, [&](auto I) {
constexpr int AlgoValue = decltype(I)::value;
res = ConvDispatch<AlgoValue>::func(features, filters, indicePairs,
indiceNum, numActOut, _inverse, _subM);
});
return res;
}
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
indiceConvBwNative(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
int64_t algo) {
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
switch (algo) {
case kBatchGemmGather:
case kBatch: {
if (kernelVolume != 1) {
return indiceConvBackwardBatch(features, filters, outGrad, indicePairs,
indiceNum, _inverse, _subM,
algo != kBatchGemmGather);
} else {
break;
}
}
case kNative:
break;
default:
TV_THROW_RT_ERR("unknown algo");
}
bool subM = _subM != 0;
bool inverse = _inverse != 0;
......@@ -437,9 +483,9 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
// get indice pair second max size based on subM symmetric property
indicePairMaxSize =
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + indicePairMaxOffset);
indicePairMaxSize = *std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() +
indicePairMaxOffset);
if (indicePairMaxSize == 0) {
return {inputGrad, filtersGrad.view(filterShape)};
}
......@@ -499,13 +545,66 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
}
std::vector<torch::Tensor>
indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
indiceConvBwSparseConvNet(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM, bool batchScatter) {
int64_t _subM) {
auto kernelVolume = indiceNum.size(0);
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
// init for subM
int indicePairMaxOffset = kernelVolume / 2;
int indicePairMaxSize = indicePairNumCpu.data_ptr<int>()[indicePairMaxOffset];
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
TV_THROW_INVALID_ARG("unknown device type");
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
fused_conv_backward_cuda(features, inputGrad, outGrad, filters[i],
filtersGrad[i], indicePairs[inverse][i],
indicePairs[!inverse][i], nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template <bool BatchScatter>
std::vector<torch::Tensor>
indiceConvBwBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto batchScatter = BatchScatter;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indiceNum.size(0);
......@@ -626,4 +725,35 @@ indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
return {inputGrad, filtersGrad.view(filterShape)};
}
template <int Algo> struct ConvBwDispatch;
template <> struct ConvBwDispatch<kNative> {
constexpr static auto *func = indiceConvBwNative;
};
template <> struct ConvBwDispatch<kBatch> {
constexpr static auto *func = indiceConvBwBatch<false>;
};
template <> struct ConvBwDispatch<kBatchGemmGather> {
constexpr static auto *func = indiceConvBwBatch<true>;
};
template <> struct ConvBwDispatch<kSparseConvNet> {
constexpr static auto *func = indiceConvBwSparseConvNet;
};
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
int64_t algo) {
std::vector<torch::Tensor> res;
tv::DispatchInt<all_conv_algos_t>()(algo, [&](auto I) {
constexpr int AlgoValue = decltype(I)::value;
res = ConvBwDispatch<AlgoValue>::func(
features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);
});
return res;
}
} // namespace spconv
......@@ -37,7 +37,7 @@ class SparseConv3dTestTorch(nn.Module):
stride,
padding,
dilation,
algo=spconv.ConvAlgo.Native):
algo=spconv.ConvAlgo.SparseConvNet):
super().__init__()
layers = [
spconv.SparseConv3d(in_channels,
......@@ -353,7 +353,7 @@ class TestSpConv(TestCase):
shapes = [[19, 18, 17]]
batchsizes = [1, 2]
in_channels = [64]
in_channels = [32]
out_channels = [32, 48, 64]
ksizes = [2, 3]
strides = [1, 2, 3]
......@@ -623,8 +623,8 @@ def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
shapes = [[400, 400, 15]]
batchsizes = [2]
in_channels = [32]
out_channels = [64]
in_channels = [19]
out_channels = [17]
ksizes = [(3, 3, 3)]
strides = [1]
paddings = [0]
......@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
if __name__ == '__main__':
main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
# main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
# main_subm(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
# main(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
# TestCase().assertAllClose(out_my, out_ref)
# unittest.main()
# TestSpConv().testSpConv3d()
TestSpConv().testSpConv3d()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment