Commit a6ae8967 authored by traveller59's avatar traveller59
Browse files

spconv v1.1 release:

1. add cuda hash support for cuda indice generation.
2. use hash table instead of dense table in CPU code.
3. add CPU-only build support.
parent 0757c45b
cmake_minimum_required(VERSION 3.13 FATAL_ERROR) cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(SparseConv LANGUAGES CXX CUDA VERSION 1.0)
option(SPCONV_BuildTests "Build the unit tests when BUILD_TESTING is enabled." OFF) option(SPCONV_BuildTests "Build the unit tests when BUILD_TESTING is enabled." ON)
set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags option(SPCONV_BuildCUDA "Build cuda code when BUILD_TESTING is enabled." ON)
if (SPCONV_BuildCUDA)
set(CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}") project(SparseConv LANGUAGES CXX CUDA VERSION 1.0)
get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
if(WIN32) # true if windows (32 and 64 bit)
set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
add_compile_definitions(TV_WINDOWS)
else() else()
set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64") project(SparseConv LANGUAGES CXX VERSION 1.0)
endif() endif()
# set(CMAKE_VERBOSE_MAKEFILE ON)
find_library(CUDA_CUDART NAMES cudart HINTS ${CUDA_LIB_PATH_HINTS})
find_library(CUDA_CUBLAS NAMES cublas HINTS ${CUDA_LIB_PATH_HINTS}) set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags
if(CMAKE_BUILD_TYPE STREQUAL "Debug") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
add_compile_definitions(TV_DEBUG) add_compile_definitions(TV_DEBUG)
endif() endif()
find_package(Torch REQUIRED) find_package(Torch REQUIRED)
torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA) if (SPCONV_BuildCUDA)
string (REPLACE ";" " " NVCC_FLAGS_EXTRA_STR "${NVCC_FLAGS_EXTRA}") set(CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA_STR}") get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
if(WIN32) # true if windows (32 and 64 bit)
set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
add_compile_definitions(TV_WINDOWS)
else()
set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64")
endif()
# set(CMAKE_VERBOSE_MAKEFILE ON)
find_library(CUDA_CUDART NAMES cudart HINTS ${CUDA_LIB_PATH_HINTS})
find_library(CUDA_CUBLAS NAMES cublas HINTS ${CUDA_LIB_PATH_HINTS})
torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA)
string (REPLACE ";" " " NVCC_FLAGS_EXTRA_STR "${NVCC_FLAGS_EXTRA}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA_STR}")
add_compile_definitions(SPCONV_CUDA)
endif()
# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) # add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
add_compile_definitions(SPCONV_CUDA)
add_subdirectory(third_party/pybind11) add_subdirectory(third_party/pybind11)
set(ALL_LIBS ${CUDA_CUDART} ${CUDA_CUBLAS} ${TORCH_LIBRARIES}) set(ALL_LIBS ${TORCH_LIBRARIES})
set(ALL_INCLUDE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include)
${PROJECT_SOURCE_DIR}/include)
if (SPCONV_BuildCUDA)
set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS})
set(ALL_INCLUDE ${ALL_INCLUDE} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
add_subdirectory(src/cuhash)
endif()
add_subdirectory(src/spconv) add_subdirectory(src/spconv)
add_subdirectory(src/utils) add_subdirectory(src/utils)
add_subdirectory(src/hash)
if (SPCONV_BuildTests) if (SPCONV_BuildTests)
include(CTest) #adds option BUILD_TESTING (default ON) include(CTest) #adds option BUILD_TESTING (default ON)
......
...@@ -2,15 +2,15 @@ ...@@ -2,15 +2,15 @@
This is a spatially sparse convolution library like [SparseConvNet](https://github.com/facebookresearch/SparseConvNet) but faster and easy to read. This library provide sparse convolution/transposed, submanifold convolution, inverse convolution and sparse maxpool. This is a spatially sparse convolution library like [SparseConvNet](https://github.com/facebookresearch/SparseConvNet) but faster and easy to read. This library provide sparse convolution/transposed, submanifold convolution, inverse convolution and sparse maxpool.
If you need more kinds of spatial layers such as avg pool, please implement it by yourself, I don't have time to do this.
The GPU Indice Generation algorithm is a unofficial implementation of paper [SECOND](http://www.mdpi.com/1424-8220/18/10/3337). That algorithm (don't include GPU SubM indice generation algorithm) may be protected by patent. The GPU Indice Generation algorithm is a unofficial implementation of paper [SECOND](http://www.mdpi.com/1424-8220/18/10/3337). That algorithm (don't include GPU SubM indice generation algorithm) may be protected by patent.
This project only support CUDA 9.0+. If you are using cuda 8.0, please update it to 9.0. This project only support CUDA 9.0+ or CPU only. If you are using cuda 8.0, please update it to 9.0.
This project only support tensors with spatial volume less than ```std::numeric_limits<int>::max()``` (~2e9). if someone really need very large space, open an issue.
## News: ## News:
2019-5-22: spconv v1.1 alpha released, now cuda hash implementation will be default. you can use ```use_hash=False``` to use dense implementation. you may see some message during running, they will be removed in future. 2019-5-24: spconv v1.1 released, now indice generation will use hash table as default (CPU code only support hash table). you can use ```use_hash=False``` to use dense table when using CUDA. In addition, add CPU only build support.
## Install on Ubuntu 16.04/18.04 ## Install on Ubuntu 16.04/18.04
...@@ -20,7 +20,7 @@ This project only support CUDA 9.0+. If you are using cuda 8.0, please update it ...@@ -20,7 +20,7 @@ This project only support CUDA 9.0+. If you are using cuda 8.0, please update it
2. Download cmake >= 3.13.2, then add cmake executables to PATH. 2. Download cmake >= 3.13.2, then add cmake executables to PATH.
3. Ensure you have installed pytorch 1.0 in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```). 3. Ensure you have installed pytorch 1.0+ in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```).
4. Run ```cd ./dist```, use pip to install generated whl file. 4. Run ```cd ./dist```, use pip to install generated whl file.
...@@ -152,6 +152,20 @@ This implementation use gather-gemm-scatter framework to do sparse convolution. ...@@ -152,6 +152,20 @@ This implementation use gather-gemm-scatter framework to do sparse convolution.
* **Bo Li** - *gpu indice generation idea, owner of patent of the sparse conv gpu indice generation algorithm (don't include subm)* - [prclibo](https://github.com/prclibo) * **Bo Li** - *gpu indice generation idea, owner of patent of the sparse conv gpu indice generation algorithm (don't include subm)* - [prclibo](https://github.com/prclibo)
## Third party libraries
* [CUDPP](https://github.com/cudpp/cudpp): A cuda library. contains a cuda hash implementation.
* [robin-map](https://github.com/Tessil/robin-map): A fast c++ hash library. almost 2x faster than std::unordered_map in this project.
* [pybind11](https://github.com/pybind/pybind11): A head-only python c++ binding library.
* [prettyprint](https://github.com/louisdx/cxx-prettyprint): A head-only library for container print.
## License ## License
This project is licensed under the Apache license 2.0 License - see the [LICENSE.md](LICENSE.md) file for details This project is licensed under the Apache license 2.0 License - see the [LICENSE.md](LICENSE.md) file for details
The [CUDPP](https://github.com/cudpp/cudpp) hash code is licensed under BSD License.
The [robin-map](https://github.com/Tessil/robin-map) code is licensed under MIT license.
\ No newline at end of file
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include <algorithm> #include <algorithm>
namespace cudahash { namespace cuhash {
//! @name Debugging functions //! @name Debugging functions
/// @{ /// @{
......
...@@ -58,7 +58,7 @@ inline void PrintMessage(const char *message, const bool error = false) { ...@@ -58,7 +58,7 @@ inline void PrintMessage(const char *message, const bool error = false) {
//! Prints a message out to the console. //! Prints a message out to the console.
inline void PrintMessage(const char *message, const bool error = false) { inline void PrintMessage(const char *message, const bool error = false) {
if (error) { if (error) {
printf("!!! %s\n", message); printf("cudahash: %s\n", message);
} else { } else {
printf("%s\n", message); printf("%s\n", message);
} }
...@@ -68,7 +68,7 @@ inline void PrintMessage(const char *message, const bool error = false) { ...@@ -68,7 +68,7 @@ inline void PrintMessage(const char *message, const bool error = false) {
/* ------------------------------------------------------------------------- /* -------------------------------------------------------------------------
Hash table constants and definitions. Hash table constants and definitions.
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
namespace cudahash { namespace cuhash {
/** /**
* \addtogroup cudpp_hash_data_structures * \addtogroup cudpp_hash_data_structures
...@@ -111,6 +111,6 @@ const float kMinimumSpaceUsages[] = {std::numeric_limits<float>::max(), ...@@ -111,6 +111,6 @@ const float kMinimumSpaceUsages[] = {std::numeric_limits<float>::max(),
/** @} */ // end cudpp_hash_data_structures /** @} */ // end cudpp_hash_data_structures
}; // namespace cudahash }; // namespace cuhash
#endif #endif
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#include <vector_types.h> #include <vector_types.h>
#include "definitions.h" #include "definitions.h"
namespace cudahash { namespace cuhash {
//! Prime number larger than the largest practical hash table size. //! Prime number larger than the largest practical hash table size.
const unsigned kPrimeDivisor = 4294967291u; const unsigned kPrimeDivisor = 4294967291u;
...@@ -89,6 +89,7 @@ unsigned stash_hash_function(const uint2 stash_constants, ...@@ -89,6 +89,7 @@ unsigned stash_hash_function(const uint2 stash_constants,
return (stash_constants.x ^ key + stash_constants.y) % kStashSize; return (stash_constants.x ^ key + stash_constants.y) % kStashSize;
} }
unsigned generate_random_uint32();
}; // namespace CuckooHashing }; // namespace CuckooHashing
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <driver_types.h> #include <driver_types.h>
namespace cudahash { namespace cuhash {
//! Makes an 64-bit Entry out of a key-value pair for the hash table. //! Makes an 64-bit Entry out of a key-value pair for the hash table.
TV_HOST_DEVICE_INLINE Entry make_entry(unsigned key, unsigned value) { TV_HOST_DEVICE_INLINE Entry make_entry(unsigned key, unsigned value) {
......
...@@ -45,7 +45,7 @@ ...@@ -45,7 +45,7 @@
/* ------------------------------------------------------------------------- /* -------------------------------------------------------------------------
Hash table code. Hash table code.
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
namespace cudahash { namespace cuhash {
//! Compute how many thread blocks are required for the given number of threads. //! Compute how many thread blocks are required for the given number of threads.
dim3 ComputeGridDim(unsigned threads); dim3 ComputeGridDim(unsigned threads);
...@@ -113,7 +113,7 @@ class HashTable { ...@@ -113,7 +113,7 @@ class HashTable {
* The input keys are expected to be completely unique. * The input keys are expected to be completely unique.
* To reduce the chance of a failure, increase the space usage or number of * To reduce the chance of a failure, increase the space usage or number of
* functions. * functions.
* Keys are not allowed to be equal to cudahash::kKeyEmpty. * Keys are not allowed to be equal to cuhash::kKeyEmpty.
*/ */
virtual bool Build(const unsigned input_size, virtual bool Build(const unsigned input_size,
const unsigned *d_keys, const unsigned *d_keys,
......
void init_genrand(unsigned long s);
void init_by_array(unsigned long init_key[], int key_length);
unsigned long genrand_int32(void);
long genrand_int31(void);
double genrand_real1(void);
double genrand_real2(void);
double genrand_real3(void);
double genrand_res53(void);
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// This file is used for c++ unit test, but pytorch jit ops don't support c++ debug build.
#ifndef PARAMS_GRID_H_ #ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_ #define PARAMS_GRID_H_
#include <tuple> #include <tuple>
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#ifndef FUSED_SPARSE_CONV_OP_H_ #ifndef FUSED_SPARSE_CONV_OP_H_
#define FUSED_SPARSE_CONV_OP_H_ #define FUSED_SPARSE_CONV_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/indice.h> #include <spconv/indice.h>
#include <spconv/reordering.h> #include <spconv/reordering.h>
#include <torch/script.h> #include <torch/script.h>
...@@ -84,7 +83,9 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil ...@@ -84,7 +83,9 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
} else { }
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
...@@ -97,6 +98,11 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil ...@@ -97,6 +98,11 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
torch::index_select_out(inputBufferBlob, features, 0, torch::index_select_out(inputBufferBlob, features, 0,
indicePairBlob);*/ indicePairBlob);*/
} }
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalGatherTime += timer.report() / 1000.0; // totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]); torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0; // totalGEMMTime += timer.report() / 1000.0;
...@@ -107,7 +113,9 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil ...@@ -107,7 +113,9 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
tv::torch2tv<const T>(outputBuffer), tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true); true);
} else { }
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output), scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer), tv::torch2tv<const T>(outputBuffer),
...@@ -115,6 +123,11 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil ...@@ -115,6 +123,11 @@ torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor fil
true); true);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0; // totalSAddTime += timer.report() / 1000.0;
} }
// std::cout << "gather time " << totalGatherTime << std::endl; // std::cout << "gather time " << totalGatherTime << std::endl;
......
...@@ -18,8 +18,51 @@ ...@@ -18,8 +18,51 @@
#include <iostream> #include <iostream>
#include <limits> #include <limits>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <tsl/robin_map.h>
#include <unordered_map>
namespace spconv { namespace spconv {
namespace detail {
template <typename T> struct ToUnsigned;
template <> struct ToUnsigned<int>{
using type = uint32_t;
};
template <> struct ToUnsigned<long>{
using type = uint64_t;
};
template <typename T> struct FNVInternal;
template <> struct FNVInternal<uint32_t>
{
constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5;
constexpr static uint32_t prime = 0x01000193;
};
template <> struct FNVInternal<uint64_t>
{
constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325;
constexpr static uint64_t prime = 0x100000001b3;
};
}
template <typename T>
using to_unsigned_t = typename detail::ToUnsigned<std::remove_const_t<T>>::type;
template <typename T>
struct FNV1a : detail::FNVInternal<T>{
std::size_t operator()(const T* data, std::size_t size){
to_unsigned_t<T> hash = detail::FNVInternal<T>::defaultOffsetBasis;
for (std::size_t i = 0; i < size; ++i) {
hash *= detail::FNVInternal<T>::prime;
hash ^= static_cast<to_unsigned_t<T>>(data[i]);
}
return hash;
}
};
template <typename Index, unsigned NDim> template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos, TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
const Index *kernelSize, const Index *kernelSize,
...@@ -169,6 +212,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn, ...@@ -169,6 +212,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
std::vector<Index> validPoints_(kernelVolume * (NDim + 1)); std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data(); Index* validPoints = validPoints_.data();
Index *pointPtr = nullptr; Index *pointPtr = nullptr;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) { for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0); batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPos<Index, NDim>( numValidPoints = getValidOutPos<Index, NDim>(
...@@ -179,16 +223,16 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn, ...@@ -179,16 +223,16 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
auto offset = pointPtr[NDim]; auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) + auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx; spatialVolume * batchIdx;
if (gridsOut[index] == -1) { if (hash.find(index) == hash.end()) {
for (unsigned k = 1; k < NDim + 1; ++k) { for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1]; indicesOut(numAct, k) = pointPtr[k - 1];
} }
indicesOut(numAct, 0) = batchIdx; indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++; hash[index] = numAct++;
} }
// indicePairs: [K, 2, L] // indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j; indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; indicePairs(offset, 1, indiceNum[offset]++) = hash[index];
} }
} }
return numAct; return numAct;
...@@ -220,6 +264,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn, ...@@ -220,6 +264,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
std::vector<Index> validPoints_(kernelVolume * (NDim + 1)); std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data(); Index* validPoints = validPoints_.data();
Index *pointPtr = nullptr; Index *pointPtr = nullptr;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) { for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0); batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPosTranspose<Index, NDim>( numValidPoints = getValidOutPosTranspose<Index, NDim>(
...@@ -230,16 +275,16 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn, ...@@ -230,16 +275,16 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
auto offset = pointPtr[NDim]; auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) + auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx; spatialVolume * batchIdx;
if (gridsOut[index] == -1) { if (hash.find(index) == hash.end()) {
for (unsigned k = 1; k < NDim + 1; ++k) { for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1]; indicesOut(numAct, k) = pointPtr[k - 1];
} }
indicesOut(numAct, 0) = batchIdx; indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++; hash[index] = numAct++;
} }
// indicePairs: [K, 2, L] // indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j; indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; indicePairs(offset, 1, indiceNum[offset]++) = hash[index];
} }
} }
return numAct; return numAct;
...@@ -271,12 +316,13 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, ...@@ -271,12 +316,13 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
std::vector<Index> validPoints_(kernelVolume * (NDim + 1)); std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data(); Index* validPoints = validPoints_.data();
Index *pointPtr = nullptr; Index *pointPtr = nullptr;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) { for (int j = 0; j < numActIn; ++j) {
Index index = 0; Index index = 0;
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1, index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) + outSpatialShape) +
spatialVolume * indicesIn(j, 0); spatialVolume * indicesIn(j, 0);
gridsOut[index] = j; hash[index] = j;
} }
Index index = 0; Index index = 0;
for (int j = 0; j < numActIn; ++j) { for (int j = 0; j < numActIn; ++j) {
...@@ -288,9 +334,9 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, ...@@ -288,9 +334,9 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
auto offset = pointPtr[NDim]; auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) + index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0); spatialVolume * indicesIn(j, 0);
if (gridsOut[index] > -1) { if (hash.find(index) == hash.end()) {
indicePairs(offset, 0, indiceNum[offset]) = j; indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; indicePairs(offset, 1, indiceNum[offset]++) = hash[index];
} }
} }
} }
......
...@@ -14,18 +14,18 @@ ...@@ -14,18 +14,18 @@
#ifndef INDICE_CU_H_ #ifndef INDICE_CU_H_
#define INDICE_CU_H_ #define INDICE_CU_H_
#include <hash/hash_table.cuh> #include <cuhash/hash_table.cuh>
#include <spconv/geometry.h> #include <spconv/geometry.h>
#include <tensorview/helper_kernel.cu.h> #include <tensorview/helper_kernel.cu.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
namespace spconv { namespace spconv {
template <typename Index, typename IndexGrid, unsigned NDim, template <typename Index, typename IndexGrid, unsigned NDim,
int KernelMaxVolume = 256> int KernelMaxVolume = 256, typename Index1D=int>
__global__ void prepareIndicePairsKernel( __global__ void prepareIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut, tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs, tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique, tv::TensorView<Index> indiceNum, tv::TensorView<Index1D> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize, const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride, const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding, const tv::SimpleVector<Index, NDim> padding,
...@@ -151,8 +151,8 @@ __global__ void ...@@ -151,8 +151,8 @@ __global__ void
assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn, assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique, tv::TensorView<Index> indicePairUnique,
unsigned table_size, const cudahash::Entry *table, unsigned table_size, const cuhash::Entry *table,
cudahash::Functions<kNumHashFunctions> constants, cuhash::Functions<kNumHashFunctions> constants,
uint2 stash_constants, unsigned stash_count) { uint2 stash_constants, unsigned stash_count) {
Index index; Index index;
...@@ -162,9 +162,9 @@ assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn, ...@@ -162,9 +162,9 @@ assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
index = indicePairs(i, 1, ix); index = indicePairs(i, 1, ix);
if (index > -1) { if (index > -1) {
auto val = auto val =
cudahash::retrieve((unsigned)(index), table_size, cuhash::retrieve((unsigned)(index), table_size,
table, constants, stash_constants, stash_count); table, constants, stash_constants, stash_count);
assert(val != cudahash::kNotFound); assert(val != cuhash::kNotFound);
indicePairs(i, 1, ix) = (unsigned)val; indicePairs(i, 1, ix) = (unsigned)val;
} }
} }
...@@ -283,8 +283,8 @@ __global__ void getSubMIndicePairsHashKernel( ...@@ -283,8 +283,8 @@ __global__ void getSubMIndicePairsHashKernel(
const tv::SimpleVector<Index, NDim> padding, const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation, const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape, const tv::SimpleVector<Index, NDim> outSpatialShape,
unsigned table_size, const cudahash::Entry *table, unsigned table_size, const cuhash::Entry *table,
cudahash::Functions<kNumHashFunctions> constants, cuhash::Functions<kNumHashFunctions> constants,
uint2 stash_constants, unsigned stash_count) { uint2 stash_constants, unsigned stash_count) {
auto numActIn = indicesIn.dim(0); auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1; Index spatialVolume = 1;
...@@ -307,9 +307,9 @@ __global__ void getSubMIndicePairsHashKernel( ...@@ -307,9 +307,9 @@ __global__ void getSubMIndicePairsHashKernel(
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) + index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
spatialVolume * indicesIn(ix, 0); spatialVolume * indicesIn(ix, 0);
auto val = auto val =
cudahash::retrieve((unsigned)(index), table_size, cuhash::retrieve((unsigned)(index), table_size,
table, constants, stash_constants, stash_count); table, constants, stash_constants, stash_count);
if (val != cudahash::kNotFound) { if (val != cuhash::kNotFound) {
auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(offset, 1, oldNum) = val; indicePairs(offset, 1, oldNum) = val;
indicePairs(offset, 0, oldNum) = ix; indicePairs(offset, 0, oldNum) = ix;
......
...@@ -181,7 +181,7 @@ std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners, ...@@ -181,7 +181,7 @@ std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
} }
return keep; return keep;
} }
#ifdef SPCONV_CUDA
constexpr int const threadsPerBlock = sizeof(unsigned long long) * 8; constexpr int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename DType> template <typename DType>
...@@ -196,6 +196,7 @@ int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out, ...@@ -196,6 +196,7 @@ int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out,
boxes.shape(0), boxes.shape(1), boxes.shape(0), boxes.shape(1),
nms_overlap_thresh, device_id); nms_overlap_thresh, device_id);
} }
#endif
} // namespace spconv } // namespace spconv
#endif #endif
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#ifndef NMS_TORCH_OP_H_ #ifndef NMS_TORCH_OP_H_
#define NMS_TORCH_OP_H_ #define NMS_TORCH_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/indice.h> #include <spconv/indice.h>
#include <spconv/reordering.h> #include <spconv/reordering.h>
#include <torch/script.h> #include <torch/script.h>
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#ifndef PILLAR_SCATTER_OP_H_ #ifndef PILLAR_SCATTER_OP_H_
#define PILLAR_SCATTER_OP_H_ #define PILLAR_SCATTER_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/pillar_scatter_functor.h> #include <spconv/pillar_scatter_functor.h>
#include <torch/script.h> #include <torch/script.h>
#include <torch_utils.h> #include <torch_utils.h>
...@@ -28,6 +27,7 @@ template <typename T> ...@@ -28,6 +27,7 @@ template <typename T>
torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors, torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
torch::Tensor shape) { torch::Tensor shape) {
TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error"); TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error");
TV_ASSERT_RT_ERR(features.device().type() == torch::kCUDA, "error");
TV_ASSERT_RT_ERR(shape.dim() == 1, "error"); TV_ASSERT_RT_ERR(shape.dim() == 1, "error");
TV_ASSERT_RT_ERR(shape.size(0) == 4, "error"); TV_ASSERT_RT_ERR(shape.size(0) == 4, "error");
TV_ASSERT_RT_ERR(features.dim() >= 3, "error"); TV_ASSERT_RT_ERR(features.dim() >= 3, "error");
...@@ -42,10 +42,11 @@ torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors, ...@@ -42,10 +42,11 @@ torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]}, torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
features.options()); features.options());
TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error"); TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");
#ifdef SPCONV_CUDA
functor::PointPillarScatter<tv::GPU, T, int> ftor; functor::PointPillarScatter<tv::GPU, T, int> ftor;
ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas), tv::torch2tv<const T>(features.squeeze()), ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas), tv::torch2tv<const T>(features.squeeze()),
tv::torch2tv<const T>(coors.squeeze())); tv::torch2tv<const T>(coors.squeeze()));
#endif
return canvas; return canvas;
} }
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#ifndef SPARSE_POOL_OP_H_ #ifndef SPARSE_POOL_OP_H_
#define SPARSE_POOL_OP_H_ #define SPARSE_POOL_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/maxpool.h> #include <spconv/maxpool.h>
#include <torch/script.h> #include <torch/script.h>
#include <torch_utils.h> #include <torch_utils.h>
...@@ -44,13 +43,19 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs, ...@@ -44,13 +43,19 @@ torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
forwardFtor(tv::CPU(), tv::torch2tv<T>(output), forwardFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i), nHot); tv::torch2tv<const int>(indicePairs).subview(i), nHot);
} else { }
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseMaxPoolForwardFunctor<tv::GPU, T, int> forwardFtor; functor::SparseMaxPoolForwardFunctor<tv::GPU, T, int> forwardFtor;
forwardFtor(tv::TorchGPU(), tv::torch2tv<T>(output), forwardFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i), nHot); tv::torch2tv<const int>(indicePairs).subview(i), nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
#endif
else{
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalTime += timer.report() / 1000.0; // totalTime += timer.report() / 1000.0;
} }
// std::cout << "maxpool forward time " << totalTime << std::endl; // std::cout << "maxpool forward time " << totalTime << std::endl;
...@@ -80,7 +85,9 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features, ...@@ -80,7 +85,9 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const T>(outGrad), tv::torch2tv<T>(inputGrad), tv::torch2tv<const T>(outGrad), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const int>(indicePairs).subview(i), nHot); tv::torch2tv<const int>(indicePairs).subview(i), nHot);
} else { }
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, int> backwardFtor; functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, int> backwardFtor;
backwardFtor(tv::TorchGPU(), tv::torch2tv<const T>(outFeatures), backwardFtor(tv::TorchGPU(), tv::torch2tv<const T>(outFeatures),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
...@@ -88,6 +95,11 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features, ...@@ -88,6 +95,11 @@ torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
tv::torch2tv<const int>(indicePairs).subview(i), nHot); tv::torch2tv<const int>(indicePairs).subview(i), nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
#endif
else{
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
} }
return inputGrad; return inputGrad;
} }
......
// Copyright 2019 Yan Yan // Copyright 2019 Yan Yan
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // You may obtain a copy of the License at
// //
// http://www.apache.org/licenses/LICENSE-2.0 // http://www.apache.org/licenses/LICENSE-2.0
// //
// Unless required by applicable law or agreed to in writing, software // Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, // distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#ifndef SPARSE_CONV_OP_H_ #ifndef SPARSE_CONV_OP_H_
#define SPARSE_CONV_OP_H_ #define SPARSE_CONV_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/indice.h> #include <spconv/indice.h>
#include <spconv/reordering.h> #include <spconv/reordering.h>
#include <torch/script.h> #include <torch/script.h>
...@@ -27,15 +26,17 @@ namespace spconv { ...@@ -27,15 +26,17 @@ namespace spconv {
template <unsigned NDim> template <unsigned NDim>
std::vector<torch::Tensor> std::vector<torch::Tensor>
getIndicePair(torch::Tensor indices, int64_t batchSize, getIndicePair(torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape, std::vector<int64_t> outSpatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride, std::vector<int64_t> spatialShape,
std::vector<int64_t> padding, std::vector<int64_t> dilation, std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose, int64_t _useHash) { std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM,
int64_t _transpose, int64_t _useHash) {
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0; bool subM = _subM != 0;
bool transpose = _transpose != 0; bool transpose = _transpose != 0;
bool useHash = _useHash != 0; // CPU always use hash (tsl::robin_map).
bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU;
auto numAct = indices.size(0); auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error"); TV_ASSERT_RT_ERR(NDim == coorDim, "error");
...@@ -54,31 +55,29 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -54,31 +55,29 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
for (int i = 1; i < outSpatialShape.size(); ++i) { for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i]; outputVolume *= outSpatialShape[i];
} }
std::string msg = "due to limits of cuda hash, the volume of dense space include batch size "; std::string msg = "due to limits of cuda hash, the volume of dense space "
msg += "must less than std::numeric_limits<int>::max()"; "include batch size ";
TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(), msg); msg += "must less than std::numeric_limits<int>::max() = 2e9";
TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(),
msg);
torch::Tensor indicePairs = torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1, torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device())); torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros( torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device())); {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
auto gridSize = batchSize * outputVolume; auto gridSize = batchSize * outputVolume;
if (useHash){ if (useHash) {
gridSize = 1; gridSize = batchSize;
} }
torch::Tensor gridOut = torch::Tensor gridOut = torch::full(
torch::full({gridSize}, -1, {gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
torch::dtype(torch::kInt32).device(indices.device())); gridOut = gridOut.view({batchSize, -1});
// std::cout << "full time " << timer.report() / 1000.0 << std::endl;
int64_t numActOut = -1; int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32; tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32; tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32; tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32; tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32; tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique =
torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]); outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]); kernelSize32.push_back(kernelSize[i]);
...@@ -97,40 +96,70 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -97,40 +96,70 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
auto getIndicePairFtor = auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>(); functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor( numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut), tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32, tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
stride32, padding32, dilation32, outSpatialShape32, transpose, useHash); tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
} else { dilation32, outSpatialShape32, transpose, false, useHash);
}
#ifdef SPCONV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtor = auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>(); functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtor( numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut), tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32, tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
stride32, padding32, dilation32, outSpatialShape32, transpose, useHash); tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, false, useHash);
if (numActOut == -1){
// build hash failed. use CPU algorithm
auto device = indices.device();
indicePairs = indicePairs.to({torch::kCPU});
indiceNum = indiceNum.to({torch::kCPU});
indices = indices.to({torch::kCPU});
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, false, useHash);
return {indices.to(device), indicePairs.to(device), indiceNum.to(device)};
}
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
} }
return {indices, indicePairs, indiceNum}; return {indices, indicePairs, indiceNum};
} else { } else {
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor outInds = torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1}, torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device())); torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) { if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor = functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>(); auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor( numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices), tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut), tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32, tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
stride32, padding32, dilation32, outSpatialShape32, transpose); kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
} else { transpose);
}
#ifdef SPCONV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtorP1 = auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>(); functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
auto getIndicePairFtorP2 = auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>(); functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
numActOut = numActOut = getIndicePairFtorP1(
getIndicePairFtorP1(tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut), tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32, tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose); padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) { if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique); auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res); indicePairUnique = std::get<0>(res);
...@@ -138,20 +167,43 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -138,20 +167,43 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut), tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose, useHash); tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
false, useHash);
if (numActOut == -1){
// build hash failed. use CPU algorithm
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
auto device = indices.device();
outInds = outInds.to({torch::kCPU});
indicePairs = indicePairs.to({torch::kCPU});
indiceNum = indiceNum.to({torch::kCPU});
indices = indices.to({torch::kCPU});
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose);
return {outInds.to(device).slice(0, 0, numActOut), indicePairs.to(device), indiceNum.to(device)};
}
} }
} }
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum}; return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
} }
} }
template <unsigned NDim> template <unsigned NDim>
std::vector<torch::Tensor> std::vector<torch::Tensor> getIndicePairPreGrid(
getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize, torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape, std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride, std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation, std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose, int64_t _useHash) { std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose,
int64_t _useHash) {
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0; bool subM = _subM != 0;
bool transpose = _transpose != 0; bool transpose = _transpose != 0;
...@@ -178,7 +230,7 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch ...@@ -178,7 +230,7 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error"); TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
torch::Tensor indicePairs = torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1, torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device())); torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros( torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device())); {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
// std::cout << "full time " << timer.report() / 1000.0 << std::endl; // std::cout << "full time " << timer.report() / 1000.0 << std::endl;
...@@ -188,9 +240,9 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch ...@@ -188,9 +240,9 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
tv::SimpleVector<int, NDim> stride32; tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32; tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32; tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = auto indicePairUnique = torch::full(
torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(), {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device())); torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]); outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]); kernelSize32.push_back(kernelSize[i]);
...@@ -209,42 +261,54 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch ...@@ -209,42 +261,54 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
auto getIndicePairFtor = auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>(); functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor( numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut), tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32, tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
stride32, padding32, dilation32, outSpatialShape32, transpose); tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
gridOut.fill_(-1); gridOut.fill_(-1);
} else { }
#ifdef SPCONV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtor = auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>(); functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtor( numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut), tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32, tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
stride32, padding32, dilation32, outSpatialShape32, transpose, true); tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, true);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
} }
return {indices, indicePairs, indiceNum}; return {indices, indicePairs, indiceNum};
} else { } else {
torch::Tensor outInds = torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1}, torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device())); torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) { if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor = functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>(); auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor( numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices), tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut), tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32, tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
stride32, padding32, dilation32, outSpatialShape32, transpose, true); kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose, true);
gridOut.fill_(-1); gridOut.fill_(-1);
} else { }
#ifdef SPCONV_CUDA
else if (indices.device().type() == torch::kCUDA) {
auto getIndicePairFtorP1 = auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>(); functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
auto getIndicePairFtorP2 = auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>(); functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
numActOut = numActOut = getIndicePairFtorP1(
getIndicePairFtorP1(tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut), tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32, tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose); padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) { if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique); auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res); indicePairUnique = std::get<0>(res);
...@@ -252,19 +316,22 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch ...@@ -252,19 +316,22 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut), tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose, true); tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
true);
} }
} }
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum}; return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
} }
} }
template <typename T> template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) { int64_t numActOut, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0; bool subM = _subM != 0;
bool inverse = _inverse != 0; bool inverse = _inverse != 0;
auto device = features.device().type(); auto device = features.device().type();
...@@ -273,13 +340,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -273,13 +340,16 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
auto numInPlanes = features.size(1); auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1); auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU}); auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter = std::max_element( auto indicePairMaxSizeIter =
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::max_element(indicePairNumCpu.data<int>(),
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>(); indicePairNumCpu.data<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data<int>();
int indicePairMaxSize = *indicePairMaxSizeIter; int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){ /*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(),
indicePairNumCpu.data<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset); indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element( auto indicePairVecMaxSizeIter = std::max_element(
...@@ -293,7 +363,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -293,7 +363,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device()); // torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options); torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options); torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer = torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options); torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes}); filters = filters.view({-1, numInPlanes, numOutPlanes});
...@@ -319,20 +390,28 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -319,20 +390,28 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
} else { nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion /* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64); auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(), {nHot}, auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
indicePairOptions); {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
torch::index_select_out(inputBufferBlob, features, 0, features, 0, indicePairBlob);*/
indicePairBlob);*/ }
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
} }
// totalGatherTime += timer.report() / 1000.0; // totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]); torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0; // totalGEMMTime += timer.report() / 1000.0;
...@@ -341,16 +420,23 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -341,16 +420,23 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output), scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer), tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
} else { }
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output), scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer), tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0; // totalSAddTime += timer.report() / 1000.0;
} }
// std::cout << "gather time " << totalGatherTime << std::endl; // std::cout << "gather time " << totalGatherTime << std::endl;
...@@ -362,8 +448,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -362,8 +448,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
template <typename T> template <typename T>
std::vector<torch::Tensor> std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters, indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor outGrad, torch::Tensor indicePairs,
int64_t _inverse, int64_t _subM) { torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0; bool subM = _subM != 0;
bool inverse = _inverse != 0; bool inverse = _inverse != 0;
...@@ -373,16 +459,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -373,16 +459,19 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
auto numInPlanes = features.size(1); auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1); auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU}); auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter = std::max_element( auto indicePairMaxSizeIter =
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::max_element(indicePairNumCpu.data<int>(),
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>(); indicePairNumCpu.data<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data<int>();
int indicePairMaxSize = *indicePairMaxSizeIter; int indicePairMaxSize = *indicePairMaxSizeIter;
auto options = auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device()); torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes(); auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options); torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options); torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options); torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer = torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options); torch::zeros({indicePairMaxSize, numOutPlanes}, options);
...@@ -403,22 +492,33 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -403,22 +492,33 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut; functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer), gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad), tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
} else { nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer), gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer), gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad), tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
auto filterGradSub = filtersGrad[i]; auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = auto outputBufferBlob =
torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options); torch::from_blob(outputBuffer.data<T>(), {nHot, numOutPlanes}, options);
...@@ -431,20 +531,29 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters, ...@@ -431,20 +531,29 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad), scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer), tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
} else { nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad), scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer), tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
} }
return {inputGrad, filtersGrad.view(filterShape)}; return {inputGrad, filtersGrad.view(filterShape)};
} }
template <typename T> template <typename T>
torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters, torch::Tensor
indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) { int64_t numActOut, int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0; bool subM = _subM != 0;
...@@ -458,18 +567,22 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -458,18 +567,22 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
auto indicePairNumCpu = indiceNum.to({torch::kCPU}); auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto totalActsTen = indicePairNumCpu.sum(); auto totalActsTen = indicePairNumCpu.sum();
auto totalActs = indicePairNumCpu.data<int>()[0]; auto totalActs = indicePairNumCpu.data<int>()[0];
auto indicePairMaxSizeIter = std::max_element( auto indicePairMaxSizeIter =
indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume); std::max_element(indicePairNumCpu.data<int>(),
int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data<int>(); indicePairNumCpu.data<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data<int>();
int indicePairMaxSize = *indicePairMaxSizeIter; int indicePairMaxSize = *indicePairMaxSizeIter;
std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(),
indicePairNumCpu.data<int>() + kernelVolume); indicePairNumCpu.data<int>() +
kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset); indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
int subRuleMaxSize = *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end()); int subRuleMaxSize =
*std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
if (subM) { if (subM) {
indicePairMaxSize = subRuleMaxSize; indicePairMaxSize = subRuleMaxSize;
} }
auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
auto options = auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device()); torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions = // auto indicePairOptions =
...@@ -481,7 +594,7 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -481,7 +594,7 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
torch::Tensor outputBuffer = torch::Tensor outputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options); torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes}); filters = filters.view({-1, numInPlanes, numOutPlanes});
std::cout << "create time " << timer.report()/1000.0 << std::endl; // std::cout << "create time " << timer.report() / 1000.0 << std::endl;
if (subM) { // the center index of subm conv don't need gather and scatter if (subM) { // the center index of subm conv don't need gather and scatter
// add. // add.
torch::mm_out(output, features, filters[indicePairMaxOffset]); torch::mm_out(output, features, filters[indicePairMaxOffset]);
...@@ -495,7 +608,7 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -495,7 +608,7 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue; continue;
} }
// //
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(), auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
{nHot, numOutPlanes}, options); {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(), auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
...@@ -504,20 +617,28 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -504,20 +617,28 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob), gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
} else { nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor; functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob), gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features), tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot); tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// } // }
// for (int i = 0; i < kernelVolume; ++i) { // for (int i = 0; i < kernelVolume; ++i) {
// totalGatherTime += timer.report() / 1000.0; // totalGatherTime += timer.report() / 1000.0;
// auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(), // auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
// {nHot, numOutPlanes}, options); // {nHot, numOutPlanes}, options);
} }
// totalGatherTime += timer.report() / 1000.0; // totalGatherTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
...@@ -548,16 +669,23 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil ...@@ -548,16 +669,23 @@ torch::Tensor indiceConvDevelopDontUse(torch::Tensor features, torch::Tensor fil
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output), scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob), tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
} else { }
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor; functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output), scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob), tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
true); nHot, true);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0; // totalSAddTime += timer.report() / 1000.0;
} }
// totalSAddTime += timer.report() / 1000.0; // totalSAddTime += timer.report() / 1000.0;
......
...@@ -16,17 +16,22 @@ ...@@ -16,17 +16,22 @@
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <cstdlib> #include <cstdlib>
#include <cuda_runtime_api.h>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
// #include <prettyprint.h> // #include <prettyprint.h>
#include <sstream> #include <sstream>
#include <type_traits> #include <type_traits>
#include <vector> #include <vector>
#ifdef SPCONV_CUDA
#include <cuda_runtime_api.h>
#endif
namespace tv { namespace tv {
#ifdef __NVCC__ #ifdef __NVCC__
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__ #define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__ #define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__ #define TV_HOST_DEVICE __device__ __host__
...@@ -113,12 +118,13 @@ void sstream_print(SStream &ss, T val, TArgs... args) { ...@@ -113,12 +118,13 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
} \ } \
} }
#ifdef SPCONV_CUDA
struct GPU { struct GPU {
GPU(cudaStream_t s = 0) : mStream(s) {} GPU(cudaStream_t s = 0) : mStream(s) {}
virtual cudaStream_t getStream() const { return mStream; } virtual cudaStream_t getStream() const { return mStream; }
cudaStream_t mStream = 0; cudaStream_t mStream = 0;
}; };
#endif
struct CPU {}; struct CPU {};
#define TV_MAX_DIM 6 #define TV_MAX_DIM 6
......
...@@ -16,16 +16,19 @@ ...@@ -16,16 +16,19 @@
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <torch/script.h> #include <torch/script.h>
#include <ATen/ATen.h> #include <ATen/ATen.h>
#ifdef SPCONV_CUDA
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#endif
namespace tv { namespace tv {
#ifdef SPCONV_CUDA
struct TorchGPU: public tv::GPU { struct TorchGPU: public tv::GPU {
virtual cudaStream_t getStream() const override { virtual cudaStream_t getStream() const override {
return at::cuda::getCurrentCUDAStream(); return at::cuda::getCurrentCUDAStream();
} }
}; };
#endif
template <typename T> void check_torch_dtype(const torch::Tensor &tensor) { template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
switch (tensor.type().scalarType()) { switch (tensor.type().scalarType()) {
case at::ScalarType::Double: { case at::ScalarType::Double: {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment