Commit 47527b00 authored by traveller59's avatar traveller59
Browse files

v1.1 alpha: add cuda hash implementation

parent 5df97387
...@@ -13,7 +13,7 @@ if(WIN32) # true if windows (32 and 64 bit) ...@@ -13,7 +13,7 @@ if(WIN32) # true if windows (32 and 64 bit)
else() else()
set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64") set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64")
endif() endif()
# set(CMAKE_VERBOSE_MAKEFILE ON)
find_library(CUDA_CUDART NAMES cudart HINTS ${CUDA_LIB_PATH_HINTS}) find_library(CUDA_CUDART NAMES cudart HINTS ${CUDA_LIB_PATH_HINTS})
find_library(CUDA_CUBLAS NAMES cublas HINTS ${CUDA_LIB_PATH_HINTS}) find_library(CUDA_CUBLAS NAMES cublas HINTS ${CUDA_LIB_PATH_HINTS})
if(CMAKE_BUILD_TYPE STREQUAL "Debug") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
...@@ -22,6 +22,10 @@ endif() ...@@ -22,6 +22,10 @@ endif()
find_package(Torch REQUIRED) find_package(Torch REQUIRED)
torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA)
string (REPLACE ";" " " NVCC_FLAGS_EXTRA_STR "${NVCC_FLAGS_EXTRA}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA_STR}")
# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) # add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
add_compile_definitions(SPCONV_CUDA) add_compile_definitions(SPCONV_CUDA)
...@@ -35,6 +39,7 @@ set(ALL_INCLUDE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ...@@ -35,6 +39,7 @@ set(ALL_INCLUDE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
add_subdirectory(src/spconv) add_subdirectory(src/spconv)
add_subdirectory(src/utils) add_subdirectory(src/utils)
add_subdirectory(src/hash)
if (SPCONV_BuildTests) if (SPCONV_BuildTests)
include(CTest) #adds option BUILD_TESTING (default ON) include(CTest) #adds option BUILD_TESTING (default ON)
......
...@@ -8,6 +8,10 @@ The GPU Indice Generation algorithm is a unofficial implementation of paper [SEC ...@@ -8,6 +8,10 @@ The GPU Indice Generation algorithm is a unofficial implementation of paper [SEC
This project only support CUDA 9.0+. If you are using cuda 8.0, please update it to 9.0. This project only support CUDA 9.0+. If you are using cuda 8.0, please update it to 9.0.
## News:
2019-5-22: spconv v1.1 alpha released. The CUDA hash implementation is now the default; pass ```use_hash=False``` to use the dense implementation instead. You may see some messages printed while running; they will be removed in a future release.
## Install on Ubuntu 16.04/18.04 ## Install on Ubuntu 16.04/18.04
0. Use ```git clone xxx.git --recursive``` to clone this repo. 0. Use ```git clone xxx.git --recursive``` to clone this repo.
......
#ifndef _CUDA_UTIL_H_
#define _CUDA_UTIL_H_
// Device-wide synchronization call, selected by CUDA runtime version:
// cudaDeviceSynchronize() when CUDART_VERSION >= 4000, cudaThreadSynchronize() otherwise.
// NOTE(review): the expansion already ends in ';'. Writing `CUDA_DEVICE_SYNCHRONIZE();`
// therefore produces an extra empty statement, and the macro cannot be used inside a
// larger expression (e.g. as an `if` condition).
#if CUDART_VERSION >= 4000
#define CUDA_DEVICE_SYNCHRONIZE( ) cudaDeviceSynchronize();
#else
#define CUDA_DEVICE_SYNCHRONIZE( ) cudaThreadSynchronize();
#endif
//! Evaluates a CUDA runtime call and terminates the process if it fails.
//! The result of `call` is stored in a local `cudaError err`; when it is not
//! cudaSuccess, the file name, line number and cudaGetErrorString() text are
//! printed to stderr and exit(EXIT_FAILURE) is called.
//! NOTE(review): the macro expands to a bare `{ ... }` block (not do/while(0)),
//! so `if (c) CUDA_SAFE_CALL_NO_SYNC(x); else ...` will not compile.
//! NOTE(review): uses fprintf/exit but this header shows no #include for them;
//! presumably the including file provides <cstdio>/<cstdlib> - confirm.
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} }
//! Same check as CUDA_SAFE_CALL_NO_SYNC; the expansion carries its own trailing ';'.
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
//! Check for CUDA error
//! CUDA_CHECK_ERROR(errorMessage) reads cudaGetLastError(); if it is not
//! cudaSuccess, it prints errorMessage together with the file, line and
//! cudaGetErrorString() text to stderr and calls exit(EXIT_FAILURE).
//! errorMessage is passed to a "%s" conversion, so it must be a C string.
#ifdef _DEBUG
// Debug variant: after the cudaGetLastError() check it also runs
// CUDA_DEVICE_SYNCHRONIZE() and applies the same check to that result.
# define CUDA_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = CUDA_DEVICE_SYNCHRONIZE(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#else
// Non-debug variant: only the cudaGetLastError() check, with no device synchronization.
# define CUDA_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#endif
#endif
\ No newline at end of file
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* debugging.h
*
* @brief Debugging/statistics/performance utilities header for hash tables.
*/
#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H
#define CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H
#include "definitions.h"
#include <cuda_runtime_api.h>
#include <vector_types.h>
#include <algorithm>
namespace cudahash {
//! @name Debugging functions
/// @{
void TakeHashFunctionStatistics(const unsigned num_keys,
const unsigned *d_keys,
const unsigned table_size,
const uint2 *constants,
const unsigned kNumHashFunctions);
//! Output how many probes were required by each thread to perform the retrieval.
/*! @param[in] n_queries Number of queries being performed.
* @param[in] d_retrieval_probes Device array: the number of probes taken for each thread's retrieval.
* @param[in] n_functions Number of hash functions used.
*/
void OutputRetrievalStatistics(const unsigned n_queries,
const unsigned *d_retrieval_probes,
const unsigned n_functions);
//! Outputs information about how many iterations threads required to successfully cuckoo hash.
/*! @param[in] n Number of keys in the input.
* @param[in] d_iterations_taken Device mem: Number of iterations each thread took.
*/
void OutputBuildStatistics(const unsigned n,
const unsigned *d_iterations_taken);
//! Prints out the contents of the stash.
void PrintStashContents(const Entry *d_stash);
//! Checks if a key is assigned the same slot by different hash functions.
bool CheckAssignedSameSlot(const unsigned N,
const unsigned num_keys,
const unsigned *d_keys,
const unsigned table_size,
uint2 *constants);
/// @}
}; // namespace cudahash
#endif
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file definitions.h
*
* @brief Stores configuration flags and definitions for hard-coded values in
* hash table implementations.
*/
#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H
#define CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H
#include <cstdio>
#include <limits>
#include <tensorview/tensorview.h>
/* --------------------------------------------------------------------------
Debugging.
-------------------------------------------------------------------------- */
#ifdef _DEBUG
//! Forces the hash functions to generate a full set of slots for each key when
//! not using subtables.
// #define FORCEFULLY_GENERATE_NO_CYCLES
//! Count how many iterations are taken to insert/find items.
#define TRACK_ITERATIONS
//! Count how many items fail to be inserted when the hash table fails to build.
#define COUNT_UNINSERTED
//! Take some statistics on the hash functions.
#define TAKE_HASH_FUNCTION_STATISTICS
#ifdef TAKE_HASH_FUNCTION_STATISTICS
//! Determine how many keys hash into each table slot.
#define COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
//! Determine how many unique slots a key is assigned.
#define COUNT_HOW_MANY_HAVE_CYCLES
#endif
#endif
#ifdef USE_DAN_OUTPUT
#include <Utilities/output.h>
//! Logs any error messages.
/*! Variant compiled when USE_DAN_OUTPUT is defined: forwards the message and
 *  the error flag unchanged to PrintIndentedMessage().
 *  NOTE(review): PrintIndentedMessage is presumably declared by
 *  <Utilities/output.h> - confirm.
 *  @param[in] message Text to log.
 *  @param[in] error   Passed through as the second argument; defaults to false.
 */
inline void PrintMessage(const char *message, const bool error = false) {
PrintIndentedMessage(message, error);
}
#else
//! Writes a message to standard output, followed by a newline.
/*! @param[in] message Text to print.
 *  @param[in] error   When true, the line is prefixed with "!!! ".
 */
inline void PrintMessage(const char *message, const bool error = false) {
  // Pick the format once; both formats consume exactly one string argument.
  printf(error ? "!!! %s\n" : "%s\n", message);
}
#endif
/* -------------------------------------------------------------------------
Hash table constants and definitions.
------------------------------------------------------------------------- */
namespace cudahash {
/**
* \addtogroup cudpp_hash_data_structures
*
* @{
*/
typedef unsigned long long
Entry; //!< A key and its value are stored in a 64-bit number. The key is
//!< stored in the upper 32 bits.
const unsigned kMaxRestartAttempts = 10; //!< Number of build attempts.
const unsigned kKeyEmpty = 0xffffffffu; //!< Signifies empty slots in the table.
const unsigned kNotFound =
0xffffffffu; //!< Signifies that a query key was not found.
const unsigned kMaxHashFunctions =
5; //!< Maximum number of hash functions allowed.
const unsigned kStashSize =
101; //!< How many slots the stash hash table contains.
//! Value indicating that a hash table slot has no valid item within it.
const Entry kEntryEmpty = Entry(kKeyEmpty) << 32;
//! Value returned when a query fails.
const Entry kEntryNotFound = (Entry(kKeyEmpty) << 32) + kNotFound;
//! Number of threads to put in a thread block.
const unsigned kBlockSize = 64;
//! Number of blocks to put along each axis of the grid.
const unsigned kGridSize = 16384;
//! Minimum table sizes for 2 through 5 functions.
const float kMinimumSpaceUsages[] = {std::numeric_limits<float>::max(),
std::numeric_limits<float>::max(),
2.01f,
1.1f,
1.03f,
1.02f};
/** @} */ // end cudpp_hash_data_structures
}; // namespace cudahash
#endif
/*! @file hash_functions.h
* @brief Hash function code.
*/
#ifndef HASH_FUNCTIONS__H
#define HASH_FUNCTIONS__H
#include <tensorview/tensorview.h>
#include <vector_types.h>
#include "definitions.h"
namespace cudahash {
//! Prime number larger than the largest practical hash table size.
const unsigned kPrimeDivisor = 4294967291u;
// https://www.alpertron.com.ar/ECM.HTM
// const unsigned long kPrimeDivisor = 18446744073709551557lu
// const long kPrimeDivisor = 9223372036854775783l
// const Entry kPrimeDivisor = 4300000013lu;
// const unsigned kPrimeDivisor = 334214459;
//! Generates a set of linear hash function constants.
/*! @param[in] N Number of hash functions.
@param[out] constants CPU pointer to the constants.
@param[in] num_keys Debug only: How many keys are in the input.
@param[in] d_keys Debug only: Device memory array containing the input keys.
@param[in] table_size Debug only: Size of the hash table.
*/
void GenerateFunctions(const unsigned N,
const unsigned num_keys,
const unsigned *d_keys,
const unsigned table_size,
uint2 *constants);
//! Container for all of the hash functions.
/*! Holds one uint2 of constants per hash function; N is the number of
 *  functions. The struct is passed by value to the kernels and device
 *  functions in hash_table.cuh.
 */
template <unsigned N>
struct Functions {
//! The constants required for all of the hash functions, including the stash. Each function requires 2.
uint2 constants[N];
//! Generate new hash function constants.
/*! The parameters are only used for debugging and examining the key distribution.
\param[in] num_keys Debug: Number of keys in the input.
\param[in] d_keys Debug: Device array of the input keys.
\param[in] table_size Debug: Size of the hash table.
*/
void Generate(const unsigned num_keys,
const unsigned *d_keys,
const unsigned table_size) {
// Forwards to the free function GenerateFunctions(), passing N and this
// struct's constants array as the output buffer.
GenerateFunctions(N, num_keys, d_keys, table_size, constants);
}
};
//! Evaluates a single hash function on a key.
/*! \param[in] constants  The two constants (x, y) that define the hash function.
 *  \param[in] key        Key being hashed.
 *  \returns The hash value, reduced modulo kPrimeDivisor.
 */
inline __device__ __host__
unsigned hash_function_inner(const uint2 constants,
                             const unsigned key) {
#if 1
  // Fast version: XOR the key with x, add y (32-bit unsigned wrap-around),
  // then reduce by the prime.
  const unsigned mixed = (constants.x ^ key) + constants.y;
  return mixed % kPrimeDivisor;
#else
  // Slow version: multiply-add carried out in 64 bits before the reduction.
  const unsigned long long product = (unsigned long long)constants.x * key;
  return (product + constants.y) % kPrimeDivisor;
#endif
}
//! Evaluates one hash function out of a set on a key.
/*! \param[in] functions       Constants for every hash function in the set.
 *  \param[in] which_function  Index into functions.constants selecting the function.
 *  \param[in] key             Key being hashed.
 *  \returns The result of hash_function_inner() for the selected constants.
 */
template <unsigned kNumHashFunctions>
TV_HOST_DEVICE_INLINE
unsigned hash_function(const Functions<kNumHashFunctions> functions,
                       const unsigned which_function,
                       const unsigned key) {
  const uint2 selected = functions.constants[which_function];
  return hash_function_inner(selected, key);
}
//! Simple hash function used by the stash.
/*! \param[in] stash_constants  The two constants (x, y) of the stash hash function.
 *  \param[in] key              Key being hashed.
 *  \returns A stash slot index in [0, kStashSize).
 *
 *  NOTE(review): '+' binds tighter than '^', so this computes
 *  x ^ (key + y), whereas hash_function_inner() computes (x ^ key) + y.
 *  The grouping is spelled out here to keep the existing behaviour; both
 *  insert() and retrieve() go through this function, so they stay consistent.
 */
TV_HOST_DEVICE_INLINE
unsigned stash_hash_function(const uint2 stash_constants,
                             const unsigned key) {
  const unsigned shifted_key = key + stash_constants.y;
  return (stash_constants.x ^ shifted_key) % kStashSize;
}
}; // namespace cudahash
#endif
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.cuh
*
* @brief Implements kernel and __device__ functions for a basic hash table.
*/
#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH
#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH
#include "definitions.h"
#include "hash_table.h"
#include <tensorview/tensorview.h>
#include <driver_types.h>
namespace cudahash {
//! Packs a key-value pair into a 64-bit Entry for the hash table.
/*! The key occupies the upper 32 bits and the value the lower 32 bits. */
TV_HOST_DEVICE_INLINE Entry make_entry(unsigned key, unsigned value) {
  const Entry upper = static_cast<Entry>(key) << 32;
  // The low 32 bits of `upper` are zero, so OR-ing in the value equals adding it.
  return upper | value;
}
//! Extracts the key (upper 32 bits) from an Entry.
TV_HOST_DEVICE_INLINE unsigned get_key(Entry entry) {
  const Entry upper = entry >> 32;
  return static_cast<unsigned>(upper);
}
//! Extracts the value (lower 32 bits) from an Entry.
TV_HOST_DEVICE_INLINE unsigned get_value(Entry entry) {
  const Entry low_word_mask = 0xffffffffull;
  return static_cast<unsigned>(entry & low_word_mask);
}
//! @name Internal
//! @brief Functions used for building the hash table.
//! @{
//! Sets every slot of an array to the same value, one slot per thread.
/*! The flat thread index is built from threadIdx.x, blockIdx.x and blockIdx.y,
 *  i.e. blocks indexed along x on a grid indexed along x and y.
 *  @param[in]  table_size  Number of slots to fill.
 *  @param[in]  value       Value written to each slot.
 *  @param[out] table       Array being filled.
 */
template <class T> __global__
void clear_table(const unsigned table_size,
                 const T        value,
                       T       *table)
{
  // Same index as threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*blockDim.x*gridDim.x,
  // factored by block (unsigned arithmetic, so the result is identical).
  const unsigned block_linear = blockIdx.x + blockIdx.y * gridDim.x;
  const unsigned slot = block_linear * blockDim.x + threadIdx.x;
  if (slot >= table_size) {
    return;  // Threads past the end of the array have nothing to do.
  }
  table[slot] = value;
}
//! Computes every slot of the main table in which a key may be stored.
/*! @param[in]  constants   Constants of the hash functions.
 *  @param[in]  table_size  Number of slots in the main table.
 *  @param[in]  key         Key whose candidate slots are wanted.
 *  @param[out] locations   Receives one slot index per hash function.
 */
template <unsigned kNumHashFunctions>
__device__ void
KeyLocations(const Functions<kNumHashFunctions> constants,
             const unsigned                     table_size,
             const unsigned                     key,
                   unsigned                     locations[kNumHashFunctions])
{
  // One candidate slot per hash function, each reduced into the table range.
#pragma unroll
  for (unsigned which = 0; which < kNumHashFunctions; ++which) {
    const unsigned hashed = hash_function(constants, which, key);
    locations[which] = hashed % table_size;
  }
}
//! @}
/* --------------------------------------------------------------------------
Retrieval functions.
-------------------------------------------------------------------------- */
//! Looks up a single key in the hash table.
/*! @ingroup PublicInterface
 *  @param[in]  query_key            Key being looked up.
 *  @param[in]  table_size           Number of slots in the main table.
 *  @param[in]  table                Hash table contents; the stash is read at table + table_size.
 *  @param[in]  constants            The hash functions used to build the table.
 *  @param[in]  stash_constants      The hash function used to build the stash.
 *  @param[in]  stash_count          Number of items in the stash; the stash is skipped when zero.
 *  @param[out] num_probes_required  Written only when TRACK_ITERATIONS is defined and the
 *                                   pointer is non-null: the number of probes performed.
 *  @returns The value stored for the key, or \ref kNotFound if the key is absent.
 */
template <unsigned kNumHashFunctions> __device__
unsigned retrieve(const unsigned                      query_key,
                  const unsigned                      table_size,
                  const Entry                        *table,
                  const Functions<kNumHashFunctions>  constants,
                  const uint2                         stash_constants,
                  const unsigned                      stash_count,
                        unsigned                     *num_probes_required = NULL)
{
  // Candidate slots for the key, one per hash function.
  unsigned slots[kNumHashFunctions];
  KeyLocations(constants, table_size, query_key, slots);

  // Probe the first slot unconditionally, then the rest only while unresolved.
  unsigned probes = 1;
  Entry found = table[slots[0]];
  unsigned found_key = get_key(found);

#pragma unroll
  for (unsigned which = 1; which < kNumHashFunctions; ++which) {
    // Stop probing once the key matched or the probed slot's key equals kNotFound.
    const bool resolved = (found_key == query_key) || (found_key == kNotFound);
    if (!resolved) {
      ++probes;
      found = table[slots[which]];
      found_key = get_key(found);
    }
  }

  // Fall back to the stash when it holds anything and the key is still unmatched.
  if (stash_count != 0 && found_key != query_key) {
    ++probes;
    const Entry *stash = table + table_size;
    found = stash[stash_hash_function(stash_constants, query_key)];
  }

#ifdef TRACK_ITERATIONS
  if (num_probes_required != NULL) {
    *num_probes_required = probes;
  }
#endif

  return (get_key(found) == query_key) ? get_value(found) : kNotFound;
}
//! Kernel: looks up a batch of keys, one query per thread.
/*! The flat thread index is built from threadIdx.x, blockIdx.x and blockIdx.y.
 *  @param[in]  n_queries            Number of keys to look up.
 *  @param[in]  keys_in              Query keys.
 *  @param[in]  table_size           Number of slots in the main table.
 *  @param[in]  table                Hash table contents.
 *  @param[in]  constants            The hash functions used to build the table.
 *  @param[in]  stash_constants      The hash function used to build the stash.
 *  @param[in]  stash_count          Number of items in the stash.
 *  @param[out] values_out           Receives the result of retrieve() for each query.
 *  @param[out] num_probes_required  Optional per-query probe counts; may be NULL.
 */
template <unsigned kNumHashFunctions> __global__
void hash_retrieve(const unsigned                      n_queries,
                   const unsigned                     *keys_in,
                   const unsigned                      table_size,
                   const Entry                        *table,
                   const Functions<kNumHashFunctions>  constants,
                   const uint2                         stash_constants,
                   const unsigned                      stash_count,
                         unsigned                     *values_out,
                         unsigned                     *num_probes_required = NULL)
{
  // Same index as threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*blockDim.x*gridDim.x.
  const unsigned block_linear = blockIdx.x + blockIdx.y * gridDim.x;
  const unsigned query = block_linear * blockDim.x + threadIdx.x;
  if (query < n_queries) {
    // Each thread gets its own probe counter slot when counters were supplied.
    unsigned *probe_slot = NULL;
    if (num_probes_required) {
      probe_slot = num_probes_required + query;
    }
    const unsigned key = keys_in[query];
    values_out[query] = retrieve<kNumHashFunctions>(key,
                                                    table_size,
                                                    table,
                                                    constants,
                                                    stash_constants,
                                                    stash_count,
                                                    probe_slot);
  }
}
/* --------------------------------------------------------------------------
Build a cuckoo hash table.
-------------------------------------------------------------------------- */
//! @name Internal
//! @{
//! Picks the slot an evicted key should move to next (round-robin over the hash functions).
/*! @param[in] constants          Constants of the hash functions.
 *  @param[in] table_size         Number of slots in the main table.
 *  @param[in] key                Key that needs a new slot.
 *  @param[in] previous_location  Slot the key was just evicted from.
 *  @returns The slot of the function following the first one that maps the key to
 *           previous_location; the first function's slot if only the last function
 *           (or none) maps to it.
 */
template <unsigned kNumHashFunctions> __device__
unsigned determine_next_location(const Functions<kNumHashFunctions> constants,
                                 const unsigned                     table_size,
                                 const unsigned                     key,
                                 const unsigned                     previous_location) {
  // All candidate slots for the key.
  unsigned candidates[kNumHashFunctions];
#pragma unroll
  for (unsigned which = 0; which < kNumHashFunctions; ++which) {
    candidates[which] = hash_function(constants, which, key) % table_size;
  }

  // Walk backwards so that the lowest matching function index decides the result.
  unsigned chosen = candidates[0];
#pragma unroll
  for (int which = kNumHashFunctions - 2; which >= 0; --which) {
    if (previous_location == candidates[which]) {
      chosen = candidates[which + 1];
    }
  }
  return chosen;
}
//! Attempts to insert a single entry into the hash table.
/*! This process stops after a certain number of iterations. If the thread is
still holding onto an item because of an eviction, it tries the stash.
If it fails to enter the stash, it returns false.
Otherwise, it succeeds and returns true.

\param[in]     table_size              Number of slots in the main table.
\param[in]     constants               Constants of the hash functions.
\param[in]     stash_constants         Constants of the stash hash function.
\param[in]     max_iteration_attempts  Maximum number of exchange iterations before falling back to the stash.
\param[in,out] table                   Main table; the stash is addressed at table + table_size.
\param[in,out] stash_count             Atomically incremented when an entry is placed in the stash.
\param[in]     entry                   Entry to insert.
\param[out]    iterations_used         Written only when an empty slot is found in the main table;
                                       it is dereferenced on that path, so it must not be null.
\returns false if the entry still being held could not be placed in the stash
         (that entry is then not stored anywhere); true otherwise.
*/
template <unsigned kNumHashFunctions> __device__
bool insert(const unsigned table_size,
const Functions<kNumHashFunctions> constants,
const uint2 stash_constants,
const unsigned max_iteration_attempts,
Entry *table,
unsigned *stash_count,
Entry entry,
unsigned *iterations_used) {
unsigned key = get_key(entry);
// The key is always inserted into its first slot at the start.
unsigned location = hash_function(constants, 0, key) % table_size;
// Keep inserting until an empty slot is found or the eviction chain grows too large.
for (unsigned its = 1; its <= max_iteration_attempts; its++) {
// Insert the new entry.
// atomicExch returns the slot's previous contents, which this thread now carries.
entry = atomicExch(&table[location], entry);
key = get_key(entry);
// If no key was evicted, we're done.
if (key == kKeyEmpty) {
*iterations_used = its;
break;
}
// Otherwise, determine where the evicted key will go.
location = determine_next_location(constants, table_size, key, location);
}
// key is still a real key here only if the loop ended without finding an empty slot.
if (key != kKeyEmpty) {
// Shove it into the stash.
unsigned slot = stash_hash_function(stash_constants, key);
Entry *stash = table + table_size;
// The CAS only stores the entry if the stash slot is empty; it returns the slot's prior contents.
Entry replaced_entry = atomicCAS(stash + slot, kEntryEmpty, entry);
if (replaced_entry != kEntryEmpty) {
return false;
} else {
atomicAdd(stash_count, 1);
}
}
return true;
}
//! Kernel: builds a basic hash table, using one big table. Each thread inserts one key-value pair.
/*! The flat thread index is built from threadIdx.x, blockIdx.x and blockIdx.y.
 *  A thread does nothing if its index is past n_entries or if *failures is
 *  already non-zero when it starts.
 *
 *  @param[in]     n_entries               Number of key-value pairs to insert.
 *  @param[in]     keys                    Input keys.
 *  @param[in]     values                  Input values, parallel to keys.
 *  @param[in]     table_size              Number of slots in the main table.
 *  @param[in]     constants               Constants of the hash functions.
 *  @param[in]     max_iteration_attempts  Longest eviction chain allowed per insertion.
 *  @param[in,out] table                   Hash table contents (stash at table + table_size).
 *  @param[in]     stash_constants         Constants of the stash hash function.
 *  @param[in,out] stash_count             Incremented by insert() for each entry placed in the stash.
 *  @param[in,out] failures                Set to 1 on failure, or incremented per failed
 *                                         insertion when COUNT_UNINSERTED is defined.
 *  @param[out]    iterations_taken        Optional; when TRACK_ITERATIONS is defined and the
 *                                         pointer is non-null, receives the iteration count
 *                                         reported by insert() for each thread.
 */
template <unsigned kNumHashFunctions> __global__
void CuckooHash(const unsigned n_entries,
                const unsigned *keys,
                const unsigned *values,
                const unsigned table_size,
                const Functions<kNumHashFunctions> constants,
                const unsigned max_iteration_attempts,
                Entry *table,
                uint2 stash_constants,
                unsigned *stash_count,
                unsigned *failures,
                unsigned *iterations_taken = nullptr) {
  // Check if this thread has an item and if any previous threads failed.
  unsigned thread_index = threadIdx.x +
                          blockIdx.x * blockDim.x +
                          blockIdx.y * blockDim.x * gridDim.x;
  if (thread_index >= n_entries || *failures)
    return;

  Entry entry = make_entry(keys[thread_index], values[thread_index]);
  unsigned iterations = 0;
  bool success = insert<kNumHashFunctions>
      (table_size, constants, stash_constants,
       max_iteration_attempts, table, stash_count, entry, &iterations);

  if (success == false) {
    // The eviction chain grew too large. Report failure.
#ifdef COUNT_UNINSERTED
    atomicAdd(failures, 1);
#else
    *failures = 1;
#endif
  }

#ifdef TRACK_ITERATIONS
  // iterations_taken defaults to nullptr, so it must be checked before writing;
  // this mirrors how hash_retrieve()/retrieve() treat num_probes_required.
  if (iterations_taken) {
    iterations_taken[thread_index] = iterations;
  }
#endif
}
//! @}
}; // namespace cudahash
#endif
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.h
*
* @brief Header for a basic hash table that stores one value per key.
*/
#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
#include "definitions.h"
#include "hash_functions.h"
#include <cstdio>
/** \addtogroup cudpp_app
* @{
*/
/** \addtogroup cudpp_hash_data_structures
* @{
*/
/* --------------------------------------------------------------------------
Doxygen definitions.
-------------------------------------------------------------------------- */
/*! @namespace CudaHT
* @brief Encapsulates the hash table library.
*/
/*! @namespace CuckooHashing
* @brief Encapsulates the cuckoo hash table that uses stashes.
*/
/* -------------------------------------------------------------------------
Hash table code.
------------------------------------------------------------------------- */
namespace cudahash {
//! Compute how many thread blocks are required for the given number of threads.
dim3 ComputeGridDim(unsigned threads);
//! Compute how long an eviction chain is allowed to become for a given input size.
/*! \param[in] num_keys Number of keys in the input.
* \param[in] table_size Number of slots in the hash table.
* \param[in] num_functions Number of hash functions being used.
* \returns The number of iterations that should be allowed.
*
* The latter two parameters are only needed when using an empirical
* formula for computing the chain length.
*/
unsigned ComputeMaxIterations(const unsigned num_keys,
const unsigned table_size,
const unsigned num_functions);
//! Basic hash table that stores one value for each key.
/*! The input consists of two unsigned arrays of keys and values.
* None of the keys are expected to be repeated.
*
* @todo Templatize the interface without forcing the header file to
* have CUDA calls.
* @ingroup cudpp_app
*/
class HashTable {
public:
HashTable();
virtual ~HashTable() {Release();}
//! Initialize the hash table's memory. Must be called before \ref
//! Build() and after the random number generator has been seeded.
/*! @param[in] max_input_size Largest expected number of items in the input.
* @param[in] space_usage Size of the hash table relative to the
* input. Bigger tables are faster to build
* and retrieve from.
* @param[in] num_functions Number of hash functions to use. May be
* 2-5. More hash functions make it easier
* to build the table, but increase
* retrieval times.
* @returns Whether the hash table was initialized successfully (true)
* or not (false).
*
* The minimum space usage is dependent on the number of functions
* being used; for two through five functions, the minimum space
* usage is 2.01, 1.1, 1.03, and 1.02 respectively.
*/
virtual bool Initialize(const unsigned max_input_size,
const float space_usage = 1.25,
const unsigned num_functions = 4);
//! Free all memory.
virtual void Release();
//! Build the hash table.
/*! @param[in] input_size Number of key-value pairs being inserted.
* @param[in] d_keys Device memory array containing all of the input
* keys.
* @param[in] d_vals Device memory array containing the keys' values.
* @returns Whether the hash table was built successfully (true) or
* not (false).
*
* Several attempts are allowed to build the hash table in case of failure.
* The input keys are expected to be completely unique.
* To reduce the chance of a failure, increase the space usage or number of
* functions.
* Keys are not allowed to be equal to cudahash::kKeyEmpty.
*/
virtual bool Build(const unsigned input_size,
const unsigned *d_keys,
const unsigned *d_vals);
//! Query the hash table.
/*! @param[in] n_queries Number of keys in the query set.
* @param[in] d_query_keys Device memory array containing all of
* the query keys.
* @param[in] d_query_results Values for the query keys.
*
* kNotFound is returned for any query key that failed to be found
* in the table.
*/
virtual void Retrieve(const unsigned n_queries,
const unsigned *d_query_keys,
unsigned *d_query_results);
//! @name Accessors
/// @brief Mainly needed to use the __device__ cudahash::retrieve()
/// function directly.
/// @{
//! Returns how many slots the hash table has.
inline unsigned get_table_size() const {return table_size_;}
//! Returns how many items are stored in the stash.
inline unsigned get_stash_count() const {return stash_count_;}
//! Returns the constants used by the stash.
inline uint2 get_stash_constants() const {return stash_constants_;}
//! Returns the hash table contents.
inline const Entry* get_contents() const {return d_contents_;}
//! Returns the number of hash functions being used.
inline unsigned get_num_hash_functions() const {return
num_hash_functions_;}
//! When using two hash functions, returns the constants.
inline Functions<2> get_constants_2() const {return constants_2_;}
//! When using three hash functions, returns the constants.
inline Functions<3> get_constants_3() const {return constants_3_;}
//! When using four hash functions, returns the constants.
inline Functions<4> get_constants_4() const {return constants_4_;}
//! When using five hash functions, returns the constants.
inline Functions<5> get_constants_5() const {return constants_5_;}
/// @}
inline Entry * data(){return d_contents_;}
inline const Entry * data() const {return d_contents_;}
protected:
unsigned table_size_; //!< Size of the hash table.
unsigned num_hash_functions_; //!< Number of hash functions being used.
Entry *d_contents_; //!< Device memory: The hash table contents. The stash is stored at the end.
unsigned stash_count_; //!< Number of key-value pairs currently stored.
uint2 stash_constants_; //!< Hash function constants for the stash.
Functions<2> constants_2_; //!< Constants for a set of two hash functions.
Functions<3> constants_3_; //!< Constants for a set of three hash functions.
Functions<4> constants_4_; //!< Constants for a set of four hash functions.
Functions<5> constants_5_; //!< Constants for a set of five hash functions.
unsigned *d_failures_; //!< Device memory: General use error flag.
};
/*! @name Internal
* @{
*/
namespace CUDAWrapper {
//! Fills a 64-bit array with a particular value.
void ClearTable(const unsigned slots_in_table,
const Entry fill_value,
Entry *d_array);
//! Calls the Cuckoo Hash construction kernel.
void CallCuckooHash(const unsigned n_entries,
const unsigned num_hash_functions,
const unsigned *d_keys,
const unsigned *d_values,
const unsigned table_size,
const Functions<2> constants_2,
const Functions<3> constants_3,
const Functions<4> constants_4,
const Functions<5> constants_5,
const unsigned max_iteration_attempts,
Entry *d_contents,
uint2 stash_constants,
unsigned *d_stash_count,
unsigned *d_failures,
unsigned *d_iterations_taken);
//! Calls the kernel that performs retrievals.
void CallHashRetrieve(const unsigned n_queries,
const unsigned num_hash_functions,
const unsigned *keys_in,
const unsigned table_size,
const Entry *table,
const Functions<2> constants_2,
const Functions<3> constants_3,
const Functions<4> constants_4,
const Functions<5> constants_5,
const uint2 stash_constants,
const unsigned stash_count,
unsigned *values_out);
};
/// @}
}; // namespace cudahash
/** @} */ // end hash table data structures
/** @} */ // end cudpp_app
#endif
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
void init_genrand(unsigned long s);
void init_by_array(unsigned long init_key[], int key_length);
unsigned long genrand_int32(void);
long genrand_int31(void);
double genrand_real1(void);
double genrand_real2(void);
double genrand_real3(void);
double genrand_res53(void);
...@@ -271,13 +271,14 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn, ...@@ -271,13 +271,14 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
std::vector<Index> validPoints_(kernelVolume * (NDim + 1)); std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index* validPoints = validPoints_.data(); Index* validPoints = validPoints_.data();
Index *pointPtr = nullptr; Index *pointPtr = nullptr;
Index index = 0;
for (int j = 0; j < numActIn; ++j) { for (int j = 0; j < numActIn; ++j) {
Index index = 0;
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1, index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) + outSpatialShape) +
spatialVolume * indicesIn(j, 0); spatialVolume * indicesIn(j, 0);
gridsOut[index] = j; gridsOut[index] = j;
} }
Index index = 0;
for (int j = 0; j < numActIn; ++j) { for (int j = 0; j < numActIn; ++j) {
numValidPoints = getValidOutPos<Index, NDim>( numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
......
...@@ -14,9 +14,10 @@ ...@@ -14,9 +14,10 @@
#ifndef INDICE_CU_H_ #ifndef INDICE_CU_H_
#define INDICE_CU_H_ #define INDICE_CU_H_
#include <tensorview/tensorview.h> #include <hash/hash_table.cuh>
#include <tensorview/helper_kernel.cu.h>
#include <spconv/geometry.h> #include <spconv/geometry.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/tensorview.h>
namespace spconv { namespace spconv {
template <typename Index, typename IndexGrid, unsigned NDim, template <typename Index, typename IndexGrid, unsigned NDim,
...@@ -127,6 +128,49 @@ __global__ void assignGridAndIndiceOutKernel( ...@@ -127,6 +128,49 @@ __global__ void assignGridAndIndiceOutKernel(
} }
} }
// Expands each flat index in indicePairUnique into one row of indicesOut.
// Row layout is (NDim + 1) entries: element 0 receives the value returned by
// tv::rowArrayIdxInv() modulo batchSize, elements 1..NDim are filled by
// tv::rowArrayIdxInv() from outSpatialShape.
// kNumHashFunctions is not used in the body.
template <typename Index, unsigned NDim,
          unsigned kNumHashFunctions = 4>
__global__ void assignIndiceOutKernel(
    tv::TensorView<Index> indicesOut, int numAct,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
  auto outBase = indicesOut.data();
  for (unsigned ix : tv::KernelLoopX<unsigned>(numAct)) {
    const unsigned rowStart = ix * (NDim + 1);
    const Index flatIndex = indicePairUnique[ix];
    // Writes the NDim coordinates after the row's first element and returns what is left.
    const Index remainder = tv::rowArrayIdxInv<Index, NDim>(
        flatIndex, outBase + rowStart + 1, outSpatialShape.data());
    indicesOut[rowStart] = remainder % batchSize;
  }
}
// Rewrites indicePairs(i, 1, ix) in place: every entry greater than -1 is used
// as a key into the cuckoo hash table and replaced by the value found there.
// Entries that are not greater than -1 are left untouched.
// The lookup is asserted to succeed (result != cudahash::kNotFound).
// indicesOut, indicePairUnique, IndexGrid and NDim are not used in the body.
template <typename Index, typename IndexGrid, unsigned NDim,
          unsigned kNumHashFunctions = 4>
__global__ void
assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
                            tv::TensorView<Index> indicePairs,
                            tv::TensorView<Index> indicePairUnique,
                            unsigned table_size, const cudahash::Entry *table,
                            cudahash::Functions<kNumHashFunctions> constants,
                            uint2 stash_constants, unsigned stash_count) {
  int kernelVolume = indicePairs.dim(0);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    for (int offset = 0; offset < kernelVolume; ++offset) {
      const Index stored = indicePairs(offset, 1, ix);
      if (!(stored > -1)) {
        continue;  // Not a valid key; nothing to translate.
      }
      auto mapped =
          cudahash::retrieve((unsigned)(stored), table_size,
                             table, constants, stash_constants, stash_count);
      assert(mapped != cudahash::kNotFound);
      indicePairs(offset, 1, ix) = (unsigned)mapped;
    }
  }
}
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void __global__ void
assignIndicePairsKernel(tv::TensorView<Index> indicesOut, assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
...@@ -150,8 +194,8 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut, ...@@ -150,8 +194,8 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void __global__ void
prepareSubMGridKernel(tv::TensorView<const Index> indicesIn, prepareSubMGridKernel(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape) { const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0); auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1; Index spatialVolume = 1;
#pragma unroll #pragma unroll
...@@ -167,6 +211,29 @@ prepareSubMGridKernel(tv::TensorView<const Index> indicesIn, ...@@ -167,6 +211,29 @@ prepareSubMGridKernel(tv::TensorView<const Index> indicesIn,
} }
} }
/// Produces one (key, value) pair per input row for later hash-table use.
///
/// For every row number yielded by `tv::KernelLoopX<int>(indicesIn.dim(0))`:
///   keys[row]   = tv::rowArrayIdx over the row's columns 1..NDim
///                 + (product of outSpatialShape) * indicesIn(row, 0)
///   values[row] = row
template <typename Index, unsigned NDim>
__global__ void
prepareSubMHashKernel(tv::TensorView<const Index> indicesIn, unsigned *keys,
                      unsigned *values,
                      const tv::SimpleVector<Index, NDim> outSpatialShape) {
  // Number of cells in one spatial volume; used as the stride for column 0.
  Index volume = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    volume *= outSpatialShape[d];
  }
  const auto rowCount = indicesIn.dim(0);
  for (int row : tv::KernelLoopX<int>(rowCount)) {
    // Skip column 0 of the row: the flattening only covers columns 1..NDim.
    auto coords = indicesIn.data() + row * (NDim + 1) + 1;
    const Index flat =
        tv::rowArrayIdx<Index, NDim>(coords, outSpatialShape.data()) +
        volume * indicesIn(row, 0);
    keys[row] = flat;
    values[row] = row;
  }
}
template <typename Index, typename IndexGrid, unsigned NDim, template <typename Index, typename IndexGrid, unsigned NDim,
int KernelMaxVolume = 256> int KernelMaxVolume = 256>
__global__ void getSubMIndicePairsKernel( __global__ void getSubMIndicePairsKernel(
...@@ -206,6 +273,52 @@ __global__ void getSubMIndicePairsKernel( ...@@ -206,6 +273,52 @@ __global__ void getSubMIndicePairsKernel(
} }
} }
/// Builds indice pairs by looking candidate output positions up in the cuda
/// hash table.
///
/// For each input row `ix` yielded by `tv::KernelLoopX<int>(indicesIn.dim(0))`
/// the kernel asks `getValidOutPos` for the candidate positions of that row,
/// flattens each candidate with `tv::rowArrayIdx` (plus the spatial volume
/// times column 0 of the input row) and queries the hash table with it. When
/// the key is found, a slot is reserved with `atomicAdd` on
/// `indiceNum[offset]` and the pair is written as
/// `indicePairs(offset, 0, slot) = ix`, `indicePairs(offset, 1, slot) = value`.
///
/// `KernelMaxVolume` sizes the per-thread candidate buffer.
template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
          unsigned kNumHashFunctions = 4>
__global__ void getSubMIndicePairsHashKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape, unsigned table_size,
    const cudahash::Entry *table,
    cudahash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
    unsigned stash_count) {
  Index volume = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    volume *= outSpatialShape[d];
  }
  const auto rowCount = indicesIn.dim(0);
  // Candidate records are NDim + 1 entries wide; the entry at position NDim
  // is read back below as the kernel offset of the candidate.
  Index candidates[KernelMaxVolume * (NDim + 1)];
  for (int ix : tv::KernelLoopX<int>(rowCount)) {
    const Index numCandidates = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        candidates);
    for (int c = 0; c < numCandidates; ++c) {
      Index *candidate = candidates + c * (NDim + 1);
      auto offset = candidate[NDim];
      const Index flat =
          tv::rowArrayIdx<Index, NDim>(candidate, outSpatialShape.data()) +
          volume * indicesIn(ix, 0);
      auto found = cudahash::retrieve((unsigned)(flat), table_size, table,
                                      constants, stash_constants, stash_count);
      if (found != cudahash::kNotFound) {
        // Reserve the next free pair slot for this kernel offset.
        auto slot = atomicAdd(indiceNum.data() + offset, Index(1));
        indicePairs(offset, 1, slot) = found;
        indicePairs(offset, 0, slot) = ix;
      }
    }
  }
}
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique, __global__ void resetGridKernel(const Index *indicePairUnique,
tv::TensorView<IndexGrid> gridsOut, tv::TensorView<IndexGrid> gridsOut,
...@@ -215,6 +328,14 @@ __global__ void resetGridKernel(const Index *indicePairUnique, ...@@ -215,6 +328,14 @@ __global__ void resetGridKernel(const Index *indicePairUnique,
} }
} }
/// Writes each index yielded by `tv::KernelLoopX<int>(size)` into the element
/// of `data` at that same index, i.e. data[k] = k.
template <typename T> __global__ void arangeKernel(T *data, int size) {
  for (int pos : tv::KernelLoopX<int>(size))
    data[pos] = pos;
}
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void __global__ void
resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut, resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
......
...@@ -44,7 +44,7 @@ struct CreateConvIndicePairFunctorP2 ...@@ -44,7 +44,7 @@ struct CreateConvIndicePairFunctorP2
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique, tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose,
bool resetGrid=false); bool resetGrid=false, bool useHash=true);
}; };
template <typename Device, typename Index, typename IndexGrid, unsigned NDim> template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
...@@ -58,7 +58,8 @@ struct CreateConvIndicePairFunctor ...@@ -58,7 +58,8 @@ struct CreateConvIndicePairFunctor
const tv::SimpleVector<Index, NDim> stride, const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding, const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation, const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false); const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false,
bool useHash=true);
}; };
template <typename Device, typename Index, typename IndexGrid, unsigned NDim> template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
...@@ -71,7 +72,8 @@ struct CreateSubMIndicePairFunctor ...@@ -71,7 +72,8 @@ struct CreateSubMIndicePairFunctor
const tv::SimpleVector<Index, NDim> stride, const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding, const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation, const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false); const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false,
bool useHash=true);
}; };
} // namespace functor } // namespace functor
} // namespace spconv } // namespace spconv
......
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef POINTPILLARS_SCATTER_FUNCTOR_H_
#define POINTPILLARS_SCATTER_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv
{
namespace functor
{
// Functor interface for the point-pillar scatter operation, parameterised on
// a device tag (Device), the element type of all three views (T) and an index
// type (Index). Index does not appear in the declared call signature.
// Only the call operator is declared here; its definition is not part of
// this header.
template <typename Device, typename T, typename Index>
struct PointPillarScatter
{
// d:        device tag/context the operation is run with.
// canvas:   writable view receiving the result.
// features: read-only view with element type T.
// coors:    read-only view with element type T.
// NOTE(review): presumably `coors` holds the canvas location of each feature
// column and `features` is written to `canvas` at those locations - confirm
// against the functor's definition.
void operator()(const Device& d, tv::TensorView<T> canvas,
tv::TensorView<const T> features,
tv::TensorView<const T> coors);
};
} // namespace functor
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PILLAR_SCATTER_OP_H_
#define PILLAR_SCATTER_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/pillar_scatter_functor.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's documentation says it only supports int64, so we need to convert to int32.
/// Allocates a zero-filled canvas and runs the GPU PointPillarScatter functor
/// on it.
///
/// @tparam T     element type used for the canvas/features/coors views; the
///               dtype of `coors` is checked against it.
/// @param features tensor with at least 3 dims, size(0) == 1 and
///               size(1) == shape[1]; must live on a CUDA device.
/// @param coors  tensor with at least 3 dims, size(0) == 1 and
///               size(2) == features.size(2); must live on a CUDA device.
/// @param shape  1-D CPU int tensor with exactly 4 entries giving the canvas
///               dimensions.
/// @return the canvas, created with `features.options()`, after the functor
///         has been invoked on it.
///
/// All precondition failures are reported through TV_ASSERT_RT_ERR.
template <typename T>
torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
                                 torch::Tensor shape) {
  TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error");
  TV_ASSERT_RT_ERR(shape.dim() == 1, "error");
  TV_ASSERT_RT_ERR(shape.size(0) == 4, "error");
  // The GPU functor below is used unconditionally, so handing it host memory
  // would make the device code dereference host pointers.
  TV_ASSERT_RT_ERR(features.device().type() == torch::kCUDA,
                   "features must be a CUDA tensor");
  TV_ASSERT_RT_ERR(coors.device().type() == torch::kCUDA,
                   "coors must be a CUDA tensor");
  TV_ASSERT_RT_ERR(features.dim() >= 3, "error");
  // coors.size(2) is read below, so its rank has to be validated first.
  TV_ASSERT_RT_ERR(coors.dim() >= 3, "coors must have at least 3 dims");
  TV_ASSERT_RT_ERR(features.size(0) == 1, "feature first dim must be 1");
  TV_ASSERT_RT_ERR(coors.size(0) == 1, "coors first dim must be 1");
  TV_ASSERT_RT_ERR(features.size(2) == coors.size(2), "err");
  tv::check_torch_dtype<int>(shape);
  tv::check_torch_dtype<T>(coors);
  auto shapeData = shape.data<int>();
  // Validate the channel count before paying for the canvas allocation.
  TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");
  torch::Tensor canvas =
      torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
                   features.options());
  functor::PointPillarScatter<tv::GPU, T, int> ftor;
  // squeeze() drops the leading size-1 dim checked above (and any other
  // size-1 dims) before the tensors are wrapped as views.
  ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas),
       tv::torch2tv<const T>(features.squeeze()),
       tv::torch2tv<const T>(coors.squeeze()));
  return canvas;
}
} // namespace spconv
#endif
\ No newline at end of file
...@@ -29,7 +29,7 @@ using namespace pybind11::literals; ...@@ -29,7 +29,7 @@ using namespace pybind11::literals;
template <typename DType, int NDim> template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels, int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> coors, py::array_t<DType> voxel_point_mask, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> voxel_size,
...@@ -37,6 +37,7 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels, ...@@ -37,6 +37,7 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
int max_voxels) { int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>(); auto points_rw = points.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>(); auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto coors_rw = coors.mutable_unchecked<2>(); auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>(); auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>(); auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
...@@ -79,6 +80,7 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels, ...@@ -79,6 +80,7 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
} }
num = num_points_per_voxel_rw(voxelidx); num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) { if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) { for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k); voxels_rw(voxelidx, num, k) = points_rw(i, k);
} }
...@@ -87,23 +89,23 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels, ...@@ -87,23 +89,23 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
} }
for (int i = 0; i < voxel_num; ++i) { for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1; coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
} }
return voxel_num; return voxel_num;
} }
template <typename DType, int NDim> template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> voxels, int points_to_voxel_3d_np_mean(py::array_t<DType> points,
py::array_t<DType> means, py::array_t<DType> voxel_point_mask, py::array_t<DType> voxels,
py::array_t<int> coors, py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points, std::vector<DType> coors_range, int max_points,
int max_voxels) { int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>(); auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>(); auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>(); auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto coors_rw = coors.mutable_unchecked<2>(); auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>(); auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>(); auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
...@@ -146,19 +148,21 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox ...@@ -146,19 +148,21 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
} }
num = num_points_per_voxel_rw(voxelidx); num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) { if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) { for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k); voxels_rw(voxelidx, num, k) = points_rw(i, k);
} }
num_points_per_voxel_rw(voxelidx) += 1; num_points_per_voxel_rw(voxelidx) += 1;
for (int k = 0; k < num_features; ++k) { for (int k = 0; k < num_features; ++k) {
means_rw(voxelidx, k) += (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1); means_rw(voxelidx, k) +=
(points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
} }
} }
} }
for (int i = 0; i < voxel_num; ++i) { for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1; coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
num = num_points_per_voxel_rw(i); num = num_points_per_voxel_rw(i);
for (int j = num; j < max_points; ++j){ for (int j = num; j < max_points; ++j) {
for (int k = 0; k < num_features; ++k) { for (int k = 0; k < num_features; ++k) {
voxels_rw(i, j, k) = means_rw(i, k); voxels_rw(i, j, k) = means_rw(i, k);
} }
...@@ -168,165 +172,19 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox ...@@ -168,165 +172,19 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
} }
template <typename DType, int NDim> template <typename DType, int NDim>
int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> voxels, int points_to_voxel_3d_with_filtering(
py::array_t<DType> height, py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> maxs, py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask, py::array_t<DType> mins,
py::array_t<int> coors, py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
py::array_t<int> coor_to_voxelidx, std::vector<DType> voxel_size, std::vector<DType> coors_range,
std::vector<DType> voxel_size, int max_points, int max_voxels, int block_factor, int block_size,
std::vector<DType> coors_range, int max_points, DType height_threshold, DType height_high_threshold) {
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto height_rw = height.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
break;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
height_rw(voxelidx, k) = std::min(points_rw(i, k), height_rw(voxelidx, k));
maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
for (int k = 0; k < num_features; ++k) {
height_rw(i, k) = maxs_rw(i, k) - height_rw(i, k);
}
}
return voxel_num;
}
/// Clears the mask entry of every point that lies in a voxel whose vertical
/// extent is below `eps`.
///
/// Pass 1 walks all points, skips those outside the grid described by
/// `coors_range` / `voxel_size`, assigns a running voxel index to each newly
/// seen voxel (recorded in `coor_to_voxelidx`, where -1 means "not seen yet")
/// and folds column 2 of the point into the per-voxel minimum (`height`) and
/// maximum (`maxs`).
/// Pass 2 walks the points again and sets `mask(i) = 0` when
/// `maxs(voxel) - height(voxel) < eps`.
///
/// @param points           2-D array of points; columns 0..NDim-1 are the
///                         coordinates used for voxelisation, column 2 is the
///                         value whose range is measured.
/// @param mask             1-D int array with one entry per point (written).
/// @param height           1-D per-voxel running minimum (read and written).
/// @param maxs             1-D per-voxel running maximum (read and written).
/// @param coor_to_voxelidx NDim-D lookup grid, indexed with three
///                         coordinates; entries equal to -1 are treated as
///                         unassigned.
/// @param voxel_size       voxel edge length per axis.
/// @param coors_range      lower bounds in [0, NDim), upper bounds in
///                         [NDim, 2*NDim).
/// @param max_voxels       currently unused.
///                         NOTE(review): sibling voxelisers stop at
///                         max_voxels; confirm whether `height`/`maxs` are
///                         sized by it and a cap is needed here.
/// @param eps              minimum vertical extent for a point to stay
///                         unmasked.
/// @return number of distinct voxels that received at least one point.
///
/// NOTE(review): unlike the sibling voxelisers, the lookup grid is not reset
/// to -1 before returning - confirm this is intended.
template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
                    py::array_t<DType> height, py::array_t<DType> maxs,
                    py::array_t<int> coor_to_voxelidx,
                    std::vector<DType> voxel_size,
                    std::vector<DType> coors_range, int max_voxels,
                    DType eps) {
  auto points_rw = points.template mutable_unchecked<2>();
  // Writes to the mask go through a mutable proxy like every other array.
  auto mask_rw = mask.mutable_unchecked<1>();
  auto height_rw = height.template mutable_unchecked<1>();
  auto maxs_rw = maxs.template mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto N = points_rw.shape(0);
  constexpr int ndim_minus_1 = NDim - 1;

  int grid_size[NDim];
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  int coor[NDim];
  // Fills `coor` (axis order reversed relative to the point columns) for
  // point `i`; returns false when the point falls outside the grid.
  auto locate = [&](int i) -> bool {
    for (int j = 0; j < NDim; ++j) {
      int c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
      if (c < 0 || c >= grid_size[j]) {
        return false;
      }
      coor[ndim_minus_1 - j] = c;
    }
    return true;
  };

  int voxel_num = 0;
  // Pass 1: assign voxel indices and accumulate per-voxel min/max of column 2.
  for (int i = 0; i < N; ++i) {
    if (!locate(i))
      continue;
    int voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
    }
    height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
    maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
  }

  // Pass 2: every in-range point now has a voxel; mask out flat voxels.
  for (int i = 0; i < N; ++i) {
    if (!locate(i))
      continue;
    const int voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if ((maxs_rw(voxelidx) - height_rw(voxelidx)) < eps) {
      mask_rw(i) = 0;
    }
  }
  return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> voxel_mask,
py::array_t<DType> mins,
py::array_t<DType> maxs,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range,
int max_points,
int max_voxels,
int block_factor,
int block_size,
DType height_threshold) {
auto points_rw = points.template mutable_unchecked<2>(); auto points_rw = points.template mutable_unchecked<2>();
auto mins_rw = mins.template mutable_unchecked<2>(); auto mins_rw = mins.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>(); auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>(); auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>(); auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
auto coors_rw = coors.mutable_unchecked<2>(); auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>(); auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
...@@ -340,7 +198,7 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy ...@@ -340,7 +198,7 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
int coor[NDim]; int coor[NDim];
int c; int c;
int grid_size[NDim]; int grid_size[NDim];
DType max_value, min_value; DType max_value, min_value;
for (int i = 0; i < NDim; ++i) { for (int i = 0; i < NDim; ++i) {
grid_size[i] = grid_size[i] =
...@@ -376,13 +234,16 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy ...@@ -376,13 +234,16 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
} }
num = num_points_per_voxel_rw(voxelidx); num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) { if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) { for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k); voxels_rw(voxelidx, num, k) = points_rw(i, k);
} }
block_coor[0] = coor[1] / block_factor; block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor; block_coor[1] = coor[2] / block_factor;
mins_rw(block_coor[0], block_coor[1]) = std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1])); mins_rw(block_coor[0], block_coor[1]) =
maxs_rw(block_coor[0], block_coor[1]) = std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1])); std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) =
std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
num_points_per_voxel_rw(voxelidx) += 1; num_points_per_voxel_rw(voxelidx) += 1;
} }
} }
...@@ -394,21 +255,23 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy ...@@ -394,21 +255,23 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
block_coor[1] = coor[2] / block_factor; block_coor[1] = coor[2] / block_factor;
min_value = mins_rw(block_coor[0], block_coor[1]); min_value = mins_rw(block_coor[0], block_coor[1]);
max_value = maxs_rw(block_coor[0], block_coor[1]); max_value = maxs_rw(block_coor[0], block_coor[1]);
startx = std::max(0, block_coor[0]-block_size/2); startx = std::max(0, block_coor[0] - block_size / 2);
stopx = std::min(block_shape_H, block_coor[0]+block_size-block_size/2); stopx =
starty = std::max(0, block_coor[1]-block_size/2); std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
stopy = std::min(block_shape_W, block_coor[1]+block_size-block_size/2); starty = std::max(0, block_coor[1] - block_size / 2);
stopy =
std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
for (int j = startx; j < stopx; ++j){ for (int j = startx; j < stopx; ++j) {
for (int k = starty; k < stopy; ++k){ for (int k = starty; k < stopy; ++k) {
min_value = std::min(min_value, mins_rw(j, k)); min_value = std::min(min_value, mins_rw(j, k));
max_value = std::max(max_value, maxs_rw(j, k)); max_value = std::max(max_value, maxs_rw(j, k));
} }
} }
voxel_mask_rw(i) = (max_value - min_value) > height_threshold; voxel_mask_rw(i) = ((max_value - min_value) > height_threshold) &&
((max_value - min_value) < height_high_threshold);
} }
return voxel_num; return voxel_num;
} }
} // namespace spconv } // namespace spconv
\ No newline at end of file
...@@ -30,10 +30,12 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -30,10 +30,12 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape, std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride, std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation, std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) { std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose, int64_t _useHash) {
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0; bool subM = _subM != 0;
bool transpose = _transpose != 0; bool transpose = _transpose != 0;
bool useHash = _useHash != 0;
auto numAct = indices.size(0); auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error"); TV_ASSERT_RT_ERR(NDim == coorDim, "error");
...@@ -52,13 +54,20 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -52,13 +54,20 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
for (int i = 1; i < outSpatialShape.size(); ++i) { for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i]; outputVolume *= outSpatialShape[i];
} }
std::string msg = "due to limits of cuda hash, the volume of dense space include batch size ";
msg += "must less than std::numeric_limits<int>::max()";
TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(), msg);
torch::Tensor indicePairs = torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1, torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device())); torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros( torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device())); {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
auto gridSize = batchSize * outputVolume;
if (useHash){
gridSize = 1;
}
torch::Tensor gridOut = torch::Tensor gridOut =
torch::full({batchSize * outputVolume}, -1, torch::full({gridSize}, -1,
torch::dtype(torch::kInt32).device(indices.device())); torch::dtype(torch::kInt32).device(indices.device()));
// std::cout << "full time " << timer.report() / 1000.0 << std::endl; // std::cout << "full time " << timer.report() / 1000.0 << std::endl;
int64_t numActOut = -1; int64_t numActOut = -1;
...@@ -90,14 +99,14 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -90,14 +99,14 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
numActOut = getIndicePairFtor( numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut), tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32, tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
stride32, padding32, dilation32, outSpatialShape32, transpose); stride32, padding32, dilation32, outSpatialShape32, transpose, useHash);
} else { } else {
auto getIndicePairFtor = auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>(); functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtor( numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut), tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32, tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
stride32, padding32, dilation32, outSpatialShape32, transpose); stride32, padding32, dilation32, outSpatialShape32, transpose, useHash);
} }
return {indices, indicePairs, indiceNum}; return {indices, indicePairs, indiceNum};
} else { } else {
...@@ -129,7 +138,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize, ...@@ -129,7 +138,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut), tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose); tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose, useHash);
} }
} }
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum}; return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
...@@ -142,10 +151,12 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch ...@@ -142,10 +151,12 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape, std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride, std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation, std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) { std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose, int64_t _useHash) {
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0; bool subM = _subM != 0;
bool transpose = _transpose != 0; bool transpose = _transpose != 0;
bool useHash = _useHash != 0;
TV_ASSERT_RT_ERR(!useHash, "error");
auto numAct = indices.size(0); auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error"); TV_ASSERT_RT_ERR(NDim == coorDim, "error");
......
...@@ -91,15 +91,29 @@ void sstream_print(SStream &ss, T val, TArgs... args) { ...@@ -91,15 +91,29 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
#define TV_CHECK_CUDA_ERR() \ #define TV_CHECK_CUDA_ERR() \
{ \ { \
auto err = cudaGetLastError(); \ auto __macro_err = cudaGetLastError(); \
if (err != cudaSuccess) { \ if (__macro_err != cudaSuccess) { \
std::stringstream __macro_s; \ std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \ __macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << err; \ __macro_s << "cuda execution failed with error " << __macro_err; \
throw std::runtime_error(__macro_s.str()); \ throw std::runtime_error(__macro_s.str()); \
} \ } \
} }
// Checks the CUDA runtime for a pending error and, if one is found, throws
// std::runtime_error. The message contains the file and line of the macro
// invocation, the numeric error code, the text returned by
// cudaGetErrorString, and finally whatever the caller passed as arguments
// (forwarded to tv::sstream_print).
//
// cudaGetLastError() both reads and clears the pending error, so a second
// check issued right after this one will not see the same failure.
// NOTE(review): the arguments are pasted verbatim after `__macro_s,`, so an
// invocation with no arguments presumably does not compile — confirm against
// the available tv::sstream_print overloads.
#define TV_CHECK_CUDA_ERR_V2(...)                                              \
  {                                                                            \
    auto __macro_err = cudaGetLastError();                                     \
    if (cudaSuccess != __macro_err) {                                          \
      std::stringstream __macro_s;                                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n"                         \
                << "cuda execution failed with error " << __macro_err          \
                << " " << cudaGetErrorString(__macro_err) << "\n";             \
      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
      throw std::runtime_error(__macro_s.str());                               \
    }                                                                          \
  }
struct GPU { struct GPU {
GPU(cudaStream_t s = 0) : mStream(s) {} GPU(cudaStream_t s = 0) : mStream(s) {}
virtual cudaStream_t getStream() const { return mStream; } virtual cudaStream_t getStream() const { return mStream; }
......
...@@ -58,6 +58,31 @@ template <typename T> void check_torch_dtype(const torch::Tensor &tensor) { ...@@ -58,6 +58,31 @@ template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
} }
} }
// Maps a C++ arithmetic type to the corresponding torch scalar-type constant.
//
// The argument is never read; it exists only so the type can be deduced
// (`type2torch(0.0f)`) as well as given explicitly (`type2torch<float>()`).
// Explicit specializations reuse the default argument of the primary
// template, so both call forms work for every supported type.
//
// Supported types: int, long, long long, float, double. Any other type selects
// the primary template, which reports "unknown type" through
// TV_ASSERT_RT_ERR at run time.
template <typename T>
constexpr auto type2torch(T val = T()) -> decltype(torch::kInt32) {
  TV_ASSERT_RT_ERR(false, "unknown type");
  // Presumably unreachable because the assertion above raises; the return
  // keeps every control path well-formed for a non-void function.
  return torch::kInt32;
}

template <>
constexpr auto type2torch(int val) -> decltype(torch::kInt32) {
  return torch::kInt32;
}

// NOTE(review): `long` is 64-bit on LP64 platforms (Linux/macOS) but 32-bit on
// LLP64 (Windows); the kInt64 mapping is kept unchanged for compatibility —
// confirm whether Windows callers rely on it.
template <>
constexpr auto type2torch(long val) -> decltype(torch::kInt32) {
  return torch::kInt64;
}

// `long long` is a distinct type from `long` even when both are 64-bit, and it
// is what int64_t aliases on LLP64 platforms. Without this specialization
// such calls would fall through to the failing primary template.
template <>
constexpr auto type2torch(long long val) -> decltype(torch::kInt32) {
  return torch::kInt64;
}

template <>
constexpr auto type2torch(float val) -> decltype(torch::kInt32) {
  return torch::kFloat32;
}

template <>
constexpr auto type2torch(double val) -> decltype(torch::kInt32) {
  return torch::kFloat64;
}
template <typename T> template <typename T>
tv::TensorView<T> torch2tv(const torch::Tensor &tensor) { tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
check_torch_dtype<T>(tensor); check_torch_dtype<T>(tensor);
......
...@@ -45,7 +45,8 @@ class CMakeBuild(build_ext): ...@@ -45,7 +45,8 @@ class CMakeBuild(build_ext):
'-DCMAKE_PREFIX_PATH={}'.format(LIBTORCH_ROOT), '-DCMAKE_PREFIX_PATH={}'.format(LIBTORCH_ROOT),
'-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION), '-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
'-DSPCONV_BuildTests=OFF', '-DSPCONV_BuildTests=OFF',
'-DCMAKE_CUDA_FLAGS="--expt-relaxed-constexpr"'] '-DCMAKE_CUDA_FLAGS="--expt-relaxed-constexpr"'
] # -arch=sm_61
cfg = 'Debug' if self.debug else 'Release' cfg = 'Debug' if self.debug else 'Release'
assert cfg == "Release", "pytorch ops don't support debug build." assert cfg == "Release", "pytorch ops don't support debug build."
build_args = ['--config', cfg] build_args = ['--config', cfg]
......
...@@ -49,6 +49,11 @@ def _calculate_fan_in_and_fan_out_hwio(tensor): ...@@ -49,6 +49,11 @@ def _calculate_fan_in_and_fan_out_hwio(tensor):
class SparseConvolution(SparseModule): class SparseConvolution(SparseModule):
__constants__ = [
'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse',
'transposed', 'output_padding', 'fused_bn'
]
def __init__(self, def __init__(self,
ndim, ndim,
in_channels, in_channels,
...@@ -64,7 +69,8 @@ class SparseConvolution(SparseModule): ...@@ -64,7 +69,8 @@ class SparseConvolution(SparseModule):
transposed=False, transposed=False,
inverse=False, inverse=False,
indice_key=None, indice_key=None,
fused_bn=False): fused_bn=False,
use_hash=True):
super(SparseConvolution, self).__init__() super(SparseConvolution, self).__init__()
assert groups == 1 assert groups == 1
if not isinstance(kernel_size, (list, tuple)): if not isinstance(kernel_size, (list, tuple)):
...@@ -96,6 +102,7 @@ class SparseConvolution(SparseModule): ...@@ -96,6 +102,7 @@ class SparseConvolution(SparseModule):
self.subm = subm self.subm = subm
self.indice_key = indice_key self.indice_key = indice_key
self.fused_bn = fused_bn self.fused_bn = fused_bn
self.use_hash = use_hash
self.weight = Parameter( self.weight = Parameter(
torch.Tensor(*kernel_size, in_channels, out_channels)) torch.Tensor(*kernel_size, in_channels, out_channels))
...@@ -167,16 +174,17 @@ class SparseConvolution(SparseModule): ...@@ -167,16 +174,17 @@ class SparseConvolution(SparseModule):
self.output_padding, self.output_padding,
self.subm, self.subm,
self.transposed, self.transposed,
grid=input.grid) grid=input.grid,
use_hash=self.use_hash)
input.indice_dict[self.indice_key] = (outids, indices, input.indice_dict[self.indice_key] = (outids, indices,
indice_pairs, indice_pairs,
indice_pair_num, indice_pair_num,
spatial_shape) spatial_shape)
if self.fused_bn: if self.fused_bn:
assert self.bias is not None assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight, self.bias, indice_pairs.to(device), out_features = ops.fused_indice_conv(
indice_pair_num, features, self.weight, self.bias, indice_pairs.to(device),
outids.shape[0], self.inverse, self.subm) indice_pair_num, outids.shape[0], self.inverse, self.subm)
else: else:
if self.subm: if self.subm:
out_features = Fsp.indice_subm_conv(features, self.weight, out_features = Fsp.indice_subm_conv(features, self.weight,
...@@ -185,15 +193,14 @@ class SparseConvolution(SparseModule): ...@@ -185,15 +193,14 @@ class SparseConvolution(SparseModule):
outids.shape[0]) outids.shape[0])
else: else:
if self.inverse: if self.inverse:
out_features = Fsp.indice_inverse_conv(features, self.weight, out_features = Fsp.indice_inverse_conv(
indice_pairs.to(device), features, self.weight, indice_pairs.to(device),
indice_pair_num, indice_pair_num, outids.shape[0])
outids.shape[0])
else: else:
out_features = Fsp.indice_conv(features, self.weight, out_features = Fsp.indice_conv(features, self.weight,
indice_pairs.to(device), indice_pairs.to(device),
indice_pair_num, indice_pair_num,
outids.shape[0]) outids.shape[0])
if self.bias is not None: if self.bias is not None:
out_features += self.bias out_features += self.bias
...@@ -214,7 +221,8 @@ class SparseConv2d(SparseConvolution): ...@@ -214,7 +221,8 @@ class SparseConv2d(SparseConvolution):
dilation=1, dilation=1,
groups=1, groups=1,
bias=True, bias=True,
indice_key=None): indice_key=None,
use_hash=True):
super(SparseConv2d, self).__init__( super(SparseConv2d, self).__init__(
2, 2,
in_channels, in_channels,
...@@ -225,7 +233,8 @@ class SparseConv2d(SparseConvolution): ...@@ -225,7 +233,8 @@ class SparseConv2d(SparseConvolution):
dilation, dilation,
groups, groups,
bias, bias,
indice_key=indice_key) indice_key=indice_key,
use_hash=use_hash)
class SparseConv3d(SparseConvolution): class SparseConv3d(SparseConvolution):
...@@ -238,7 +247,8 @@ class SparseConv3d(SparseConvolution): ...@@ -238,7 +247,8 @@ class SparseConv3d(SparseConvolution):
dilation=1, dilation=1,
groups=1, groups=1,
bias=True, bias=True,
indice_key=None): indice_key=None,
use_hash=True):
super(SparseConv3d, self).__init__( super(SparseConv3d, self).__init__(
3, 3,
in_channels, in_channels,
...@@ -249,7 +259,9 @@ class SparseConv3d(SparseConvolution): ...@@ -249,7 +259,9 @@ class SparseConv3d(SparseConvolution):
dilation, dilation,
groups, groups,
bias, bias,
indice_key=indice_key) indice_key=indice_key,
use_hash=use_hash)
class SparseConv4d(SparseConvolution): class SparseConv4d(SparseConvolution):
def __init__(self, def __init__(self,
...@@ -261,7 +273,8 @@ class SparseConv4d(SparseConvolution): ...@@ -261,7 +273,8 @@ class SparseConv4d(SparseConvolution):
dilation=1, dilation=1,
groups=1, groups=1,
bias=True, bias=True,
indice_key=None): indice_key=None,
use_hash=True):
super(SparseConv4d, self).__init__( super(SparseConv4d, self).__init__(
4, 4,
in_channels, in_channels,
...@@ -272,7 +285,8 @@ class SparseConv4d(SparseConvolution): ...@@ -272,7 +285,8 @@ class SparseConv4d(SparseConvolution):
dilation, dilation,
groups, groups,
bias, bias,
indice_key=indice_key) indice_key=indice_key,
use_hash=use_hash)
class SparseConvTranspose2d(SparseConvolution): class SparseConvTranspose2d(SparseConvolution):
...@@ -285,7 +299,8 @@ class SparseConvTranspose2d(SparseConvolution): ...@@ -285,7 +299,8 @@ class SparseConvTranspose2d(SparseConvolution):
dilation=1, dilation=1,
groups=1, groups=1,
bias=True, bias=True,
indice_key=None): indice_key=None,
use_hash=True):
super(SparseConvTranspose2d, self).__init__( super(SparseConvTranspose2d, self).__init__(
2, 2,
in_channels, in_channels,
...@@ -297,7 +312,8 @@ class SparseConvTranspose2d(SparseConvolution): ...@@ -297,7 +312,8 @@ class SparseConvTranspose2d(SparseConvolution):
groups, groups,
bias, bias,
transposed=True, transposed=True,
indice_key=indice_key) indice_key=indice_key,
use_hash=use_hash)
class SparseConvTranspose3d(SparseConvolution): class SparseConvTranspose3d(SparseConvolution):
...@@ -310,7 +326,8 @@ class SparseConvTranspose3d(SparseConvolution): ...@@ -310,7 +326,8 @@ class SparseConvTranspose3d(SparseConvolution):
dilation=1, dilation=1,
groups=1, groups=1,
bias=True, bias=True,
indice_key=None): indice_key=None,
use_hash=True):
super(SparseConvTranspose3d, self).__init__( super(SparseConvTranspose3d, self).__init__(
3, 3,
in_channels, in_channels,
...@@ -322,7 +339,8 @@ class SparseConvTranspose3d(SparseConvolution): ...@@ -322,7 +339,8 @@ class SparseConvTranspose3d(SparseConvolution):
groups, groups,
bias, bias,
transposed=True, transposed=True,
indice_key=indice_key) indice_key=indice_key,
use_hash=use_hash)
class SparseInverseConv2d(SparseConvolution): class SparseInverseConv2d(SparseConvolution):
...@@ -369,7 +387,8 @@ class SubMConv2d(SparseConvolution): ...@@ -369,7 +387,8 @@ class SubMConv2d(SparseConvolution):
dilation=1, dilation=1,
groups=1, groups=1,
bias=True, bias=True,
indice_key=None): indice_key=None,
use_hash=True):
super(SubMConv2d, self).__init__( super(SubMConv2d, self).__init__(
2, 2,
in_channels, in_channels,
...@@ -381,7 +400,8 @@ class SubMConv2d(SparseConvolution): ...@@ -381,7 +400,8 @@ class SubMConv2d(SparseConvolution):
groups, groups,
bias, bias,
True, True,
indice_key=indice_key) indice_key=indice_key,
use_hash=use_hash)
class SubMConv3d(SparseConvolution): class SubMConv3d(SparseConvolution):
...@@ -394,7 +414,8 @@ class SubMConv3d(SparseConvolution): ...@@ -394,7 +414,8 @@ class SubMConv3d(SparseConvolution):
dilation=1, dilation=1,
groups=1, groups=1,
bias=True, bias=True,
indice_key=None): indice_key=None,
use_hash=True):
super(SubMConv3d, self).__init__( super(SubMConv3d, self).__init__(
3, 3,
in_channels, in_channels,
...@@ -406,7 +427,9 @@ class SubMConv3d(SparseConvolution): ...@@ -406,7 +427,9 @@ class SubMConv3d(SparseConvolution):
groups, groups,
bias, bias,
True, True,
indice_key=indice_key) indice_key=indice_key,
use_hash=use_hash)
class SubMConv4d(SparseConvolution): class SubMConv4d(SparseConvolution):
def __init__(self, def __init__(self,
...@@ -418,7 +441,8 @@ class SubMConv4d(SparseConvolution): ...@@ -418,7 +441,8 @@ class SubMConv4d(SparseConvolution):
dilation=1, dilation=1,
groups=1, groups=1,
bias=True, bias=True,
indice_key=None): indice_key=None,
use_hash=True):
super(SubMConv4d, self).__init__( super(SubMConv4d, self).__init__(
4, 4,
in_channels, in_channels,
...@@ -430,4 +454,5 @@ class SubMConv4d(SparseConvolution): ...@@ -430,4 +454,5 @@ class SubMConv4d(SparseConvolution):
groups, groups,
bias, bias,
True, True,
indice_key=indice_key) indice_key=indice_key,
use_hash=use_hash)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment