v1.1 alpha: add cuda hash implementation

47527b00 · traveller59 · 5df97387 · 47527b00 · 47527b00 · 47527b00
Commit 47527b00 authored May 22, 2019 by traveller59
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,7 +13,7 @@ if(WIN32) # true if windows (32 and 64 bit)
 else()
    set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64")
 endif()
+# set(CMAKE_VERBOSE_MAKEFILE ON)
 find_library(CUDA_CUDART NAMES cudart HINTS ${CUDA_LIB_PATH_HINTS})
 find_library(CUDA_CUBLAS NAMES cublas HINTS ${CUDA_LIB_PATH_HINTS})
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -22,6 +22,10 @@ endif()
 find_package(Torch REQUIRED)
+torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA)
+string (REPLACE ";" " " NVCC_FLAGS_EXTRA_STR "${NVCC_FLAGS_EXTRA}")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA_STR}")
 # add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
 add_compile_definitions(SPCONV_CUDA)
@@ -35,6 +39,7 @@ set(ALL_INCLUDE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
 add_subdirectory(src/spconv)
 add_subdirectory(src/utils)
+add_subdirectory(src/hash)
 if (SPCONV_BuildTests)
    include(CTest) #adds option BUILD_TESTING (default ON)

--- a/README.md
+++ b/README.md
@@ -8,6 +8,10 @@ The GPU Indice Generation algorithm is a unofficial implementation of paper [SEC
 This project only support CUDA 9.0+. If you are using cuda 8.0, please update it to 9.0.
+## News:
+2019-5-22: spconv v1.1 alpha released. The CUDA hash implementation is now the default; pass ```use_hash=False``` to use the dense implementation instead. You may see some messages printed while running; they will be removed in a future release.
 ## Install on Ubuntu 16.04/18.04
 0. Use ```git clone xxx.git --recursive``` to clone this repo.

--- a/include/hash/cuda_util.h
+++ b/include/hash/cuda_util.h
+#ifndef _CUDA_UTIL_H_
+#define _CUDA_UTIL_H_
+// Error-checking helpers for CUDA runtime calls.
+//
+// The runtime header is included here so that CUDART_VERSION is defined before
+// it is tested below: an undefined macro evaluates to 0 inside #if, which would
+// silently select the deprecated cudaThreadSynchronize() branch.  <cstdio> and
+// <cstdlib> provide fprintf() and exit() used by the macros.
+#include <cstdio>
+#include <cstdlib>
+#include <cuda_runtime_api.h>
+//! Blocks until the device has finished its outstanding work.
+//! The expansion keeps its trailing ';' so existing call sites are unaffected.
+#if CUDART_VERSION >= 4000
+#define CUDA_DEVICE_SYNCHRONIZE( )   cudaDeviceSynchronize();
+#else
+#define CUDA_DEVICE_SYNCHRONIZE( )   cudaThreadSynchronize();
+#endif
+//! Evaluates a CUDA runtime call once; on failure prints the file, line and
+//! error string to stderr and terminates the process.
+#  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
+    cudaError_t err = (call);                                                \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
+                __FILE__, __LINE__, cudaGetErrorString( err) );              \
+        exit(EXIT_FAILURE);                                                  \
+    } }
+//! Same check as CUDA_SAFE_CALL_NO_SYNC (no synchronization is added).
+#  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);
+//! Check for CUDA error
+/*! Reads (and clears) the last recorded runtime error; on failure prints
+ *  errorMessage with the file, line and error string, then exits.
+ *  When _DEBUG is defined the device is additionally synchronized and the
+ *  result of that synchronization is checked the same way.
+ */
+#ifdef _DEBUG
+#  define CUDA_CHECK_ERROR(errorMessage) {                                    \
+    cudaError_t err = cudaGetLastError();                                    \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
+                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
+        exit(EXIT_FAILURE);                                                  \
+    }                                                                        \
+    err = CUDA_DEVICE_SYNCHRONIZE();                                           \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
+                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
+        exit(EXIT_FAILURE);                                                  \
+    }                                                                        \
+    }
+#else
+#  define CUDA_CHECK_ERROR(errorMessage) {                                    \
+    cudaError_t err = cudaGetLastError();                                    \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
+                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
+        exit(EXIT_FAILURE);                                                  \
+    }                                                                        \
+    }
+#endif
+#endif
\ No newline at end of file
--- a/include/hash/debugging.h
+++ b/include/hash/debugging.h
+// -------------------------------------------------------------
+// cuDPP -- CUDA Data Parallel Primitives library
+// -------------------------------------------------------------
+// $Revision:$
+// $Date:$
+// ------------------------------------------------------------- 
+// This source code is distributed under the terms of license.txt in
+// the root directory of this source distribution.
+// ------------------------------------------------------------- 
+/**
+ * @file
+ * debugging.h
+ *
+ * @brief Debugging/statistics/performance utilities header for hash tables.
+ */
+#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H
+#define CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H
+#include "definitions.h"
+#include <cuda_runtime_api.h>
+#include <vector_types.h>
+#include <algorithm>
+namespace cudahash {
+//! @name Debugging functions
+/// @{
+//! Takes statistics on the hash functions (declaration only; the implementation is not in this header).
+/*! NOTE(review): parameter meanings below are inferred from their names and
+ *  from the GenerateFunctions() documentation -- confirm against the definition.
+ *  @param[in]  num_keys           Presumably the number of input keys.
+ *  @param[in]  d_keys             Presumably a device array holding the input keys.
+ *  @param[in]  table_size         Presumably the number of slots in the hash table.
+ *  @param[in]  constants          Hash function constants, one uint2 per function.
+ *  @param[in]  kNumHashFunctions  Number of hash functions described by \p constants.
+ */
+void TakeHashFunctionStatistics(const unsigned   num_keys,
+                                const unsigned  *d_keys,
+                                const unsigned   table_size,
+                                const uint2     *constants,
+                                const unsigned   kNumHashFunctions);
+//! Output how many probes were required by each thread to perform the retrieval.
+/*! @param[in]  n_queries           Number of queries being performed.
+ *  @param[in]  d_retrieval_probes  Device array: the number of probes taken for each thread's retrieval.
+ *  @param[in]  n_functions         Number of hash functions used.
+ */
+void OutputRetrievalStatistics(const unsigned  n_queries,
+                               const unsigned *d_retrieval_probes,
+                               const unsigned  n_functions);
+//! Outputs information about how many iterations threads required to successfully cuckoo hash.
+/*! @param[in]  n                       Number of keys in the input.
+ *  @param[in]  d_iterations_taken      Device mem: Number of iterations each thread took.
+ *
+ *  The previously documented d_max_iterations_taken parameter is not part of
+ *  this signature.
+ */
+void OutputBuildStatistics(const unsigned  n,
+                           const unsigned *d_iterations_taken);
+//! Prints out the contents of the stash.
+/*! @param[in]  d_stash  Pointer to the stash entries (presumably device memory,
+ *                       going by the d_ prefix -- confirm).
+ */
+void PrintStashContents(const Entry *d_stash);
+//! Checks if a key is assigned the same slot by different hash functions.
+/*! NOTE(review): parameter meanings are inferred from the matching
+ *  GenerateFunctions() signature -- confirm against the definition.
+ *  @param[in]  N           Presumably the number of hash functions.
+ *  @param[in]  num_keys    Presumably the number of input keys.
+ *  @param[in]  d_keys      Presumably a device array holding the input keys.
+ *  @param[in]  table_size  Presumably the number of slots in the hash table.
+ *  @param[in]  constants   Hash function constants (non-const pointer).
+ */
+bool CheckAssignedSameSlot(const unsigned  N,
+                           const unsigned  num_keys,
+                           const unsigned *d_keys,
+                           const unsigned  table_size,
+                                 uint2    *constants);
+/// @}
+}; // namespace cudahash
+#endif
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
--- a/include/hash/definitions.h
+++ b/include/hash/definitions.h
+// -------------------------------------------------------------
+// cuDPP -- CUDA Data Parallel Primitives library
+// -------------------------------------------------------------
+// $Revision:$
+// $Date:$
+// -------------------------------------------------------------
+// This source code is distributed under the terms of license.txt in
+// the root directory of this source distribution.
+// -------------------------------------------------------------
+/**
+ * @file definitions.h
+ *
+ * @brief Stores configuration flags and definitions for hard-coded values in
+ * hash table implementations.
+ */
+#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H
+#define CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H
+#include <cstdio>
+#include <limits>
+#include <tensorview/tensorview.h>
+/* --------------------------------------------------------------------------
+   Debugging.
+   -------------------------------------------------------------------------- */
+#ifdef _DEBUG
+//! Forces the hash functions to generate a full set of slots for each key when
+//! not using subtables.
+// #define FORCEFULLY_GENERATE_NO_CYCLES
+//! Count how many iterations are taken to insert/find items.
+#define TRACK_ITERATIONS
+//! Count how many items fail to be inserted when the hash table fails to build.
+#define COUNT_UNINSERTED
+//! Take some statistics on the hash functions.
+#define TAKE_HASH_FUNCTION_STATISTICS
+#ifdef TAKE_HASH_FUNCTION_STATISTICS
+//! Determine how many keys hash into each table slot.
+#define COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
+//! Determine how many unique slots a key is assigned.
+#define COUNT_HOW_MANY_HAVE_CYCLES
+#endif
+#endif
+#ifdef USE_DAN_OUTPUT
+#include <Utilities/output.h>
+//! Hands the message and its error flag over to PrintIndentedMessage().
+inline void PrintMessage(const char *message, const bool error = false) {
+  PrintIndentedMessage(message, error);
+}
+#else
+//! Writes the message to stdout on a line of its own; messages flagged as
+//! errors are prefixed with "!!! ".
+inline void PrintMessage(const char *message, const bool error = false) {
+  printf(error ? "!!! %s\n" : "%s\n", message);
+}
+#endif
+/* -------------------------------------------------------------------------
+   Hash table constants and definitions.
+   ------------------------------------------------------------------------- */
+namespace cudahash {
+/**
+ * \addtogroup cudpp_hash_data_structures
+ *
+ * @{
+ */
+//! A key and its value are stored in a 64-bit number.  The key is stored in
+//! the upper 32 bits.
+using Entry = unsigned long long;
+//! Number of build attempts.
+const unsigned kMaxRestartAttempts = 10;
+//! Signifies empty slots in the table.
+const unsigned kKeyEmpty = 0xffffffffu;
+//! Signifies that a query key was not found.
+const unsigned kNotFound = 0xffffffffu;
+//! Maximum number of hash functions allowed.
+const unsigned kMaxHashFunctions = 5;
+//! How many slots the stash hash table contains.
+const unsigned kStashSize = 101;
+//! Value indicating that a hash table slot has no valid item within it:
+//! the empty key in the upper 32 bits, zero in the lower 32 bits.
+const Entry kEntryEmpty = static_cast<Entry>(kKeyEmpty) << 32;
+//! Value returned when a query fails: the empty key in the upper 32 bits and
+//! kNotFound in the lower 32 bits.
+const Entry kEntryNotFound = (static_cast<Entry>(kKeyEmpty) << 32) + kNotFound;
+//! Number of threads to put in a thread block.
+const unsigned kBlockSize = 64;
+//! Number of blocks to put along each axis of the grid.
+const unsigned kGridSize = 16384;
+//! Minimum table sizes for 2 through 5 functions.  The two leading entries are
+//! the largest float, so the array is apparently meant to be indexed directly
+//! by the number of hash functions.
+const float kMinimumSpaceUsages[] = {
+    std::numeric_limits<float>::max(),
+    std::numeric_limits<float>::max(),
+    2.01f,
+    1.1f,
+    1.03f,
+    1.02f,
+};
+/** @} */ // end cudpp_hash_data_structures
+}  // namespace cudahash
+#endif
--- a/include/hash/hash_functions.h
+++ b/include/hash/hash_functions.h
+/*! @file hash_functions.h
+ *  @brief Hash function code.
+ */
+#ifndef HASH_FUNCTIONS__H
+#define HASH_FUNCTIONS__H
+#include <tensorview/tensorview.h>
+#include <vector_types.h>
+#include "definitions.h"
+namespace cudahash {
+/*! Prime number larger than the largest practical hash table size.
+ *  4294967291 equals 2^32 - 5, so it still fits in a 32-bit unsigned; every
+ *  hash_function_inner() result is reduced modulo this value.
+ */
+const unsigned kPrimeDivisor = 4294967291u;
+// https://www.alpertron.com.ar/ECM.HTM
+// const unsigned long kPrimeDivisor = 18446744073709551557lu
+// const long kPrimeDivisor = 9223372036854775783l
+// const Entry kPrimeDivisor = 4300000013lu;
+// const unsigned kPrimeDivisor = 334214459;
+//! Generates a set of linear hash function constants.
+/*! Parameters are listed in signature order.
+    @param[in]  N           Number of hash functions.
+    @param[in]  num_keys    Debug only: How many keys are in the input.
+    @param[in]  d_keys      Debug only: Device memory array containing the input keys.
+    @param[in]  table_size  Debug only: Size of the hash table.
+    @param[out] constants   CPU pointer to the constants; Functions<N>::Generate()
+                            passes its array of N uint2 values here.
+ */
+void GenerateFunctions(const unsigned  N,
+                       const unsigned  num_keys,
+                       const unsigned *d_keys,
+                       const unsigned  table_size,
+                             uint2    *constants);
+//! Container for all of the hash functions.
+template <unsigned N>
+struct Functions {
+  //! The constants required for the N hash functions: one uint2 (two 32-bit constants) per function.  NOTE(review): an earlier comment said this includes the stash, but HashTable keeps the stash constants separately in stash_constants_ -- confirm.
+  uint2 constants[N];
+  //! Generate new hash function constants.
+  /*! Forwards to GenerateFunctions() with N and this object's constants array.
+      The parameters are only used for debugging and examining the key distribution.
+      \param[in] num_keys   Debug: Number of keys in the input.
+      \param[in] d_keys     Debug: Device array of the input keys.
+      \param[in] table_size Debug: Size of the hash table.
+  */
+  void Generate(const unsigned  num_keys,
+                const unsigned *d_keys,
+                const unsigned  table_size) {
+    GenerateFunctions(N, num_keys, d_keys, table_size, constants);
+  }
+};
+//! Evaluates a single hash function on a key.
+/*! \param[in] constants  The (x, y) constant pair of the hash function.
+    \param[in] key        Key being hashed.
+    \returns              The hash value, reduced modulo kPrimeDivisor.
+ */
+inline __device__ __host__
+unsigned hash_function_inner(const uint2    constants,
+                             const unsigned key) {
+#if 1
+  // Fast version: XOR the key with x, add y (32-bit wrap-around), then reduce.
+  const unsigned mixed   = constants.x ^ key;
+  const unsigned shifted = mixed + constants.y;
+  return shifted % kPrimeDivisor;
+#else
+  // Slow version: the multiply-add is carried out in 64 bits before reducing.
+  const unsigned long long product = (unsigned long long)constants.x * key;
+  return (product + constants.y) % kPrimeDivisor;
+#endif
+}
+//! Evaluates one hash function out of a set on a key.
+/*! \param[in] functions        All of the constants used by the hash functions.
+    \param[in] which_function   Index of the hash function to apply.
+    \param[in] key              Key being hashed.
+    \returns                    The result of hash_function_inner() for the
+                                selected constants and the key.
+ */
+template <unsigned kNumHashFunctions>
+TV_HOST_DEVICE_INLINE
+unsigned hash_function(const Functions<kNumHashFunctions> functions,
+                       const unsigned which_function,
+                       const unsigned key) {
+  const uint2 selected = functions.constants[which_function];
+  return hash_function_inner(selected, key);
+}
+//! Simple hash function used by the stash.
+/*! \param[in] stash_constants  The (x, y) constant pair of the stash hash function.
+    \param[in] key              Key being hashed.
+    \returns                    A stash slot index in [0, kStashSize).
+
+    The original expression was written `x ^ key + y`.  Because '+' binds
+    tighter than '^', it has always been evaluated as `x ^ (key + y)`; that
+    grouping is spelled out here so the result is unchanged while the intent is
+    no longer ambiguous.  Note that hash_function_inner() groups the other way,
+    `(x ^ key) + y`.
+ */
+TV_HOST_DEVICE_INLINE
+unsigned stash_hash_function(const uint2 stash_constants,
+                             const unsigned key) {
+  const unsigned offset_key = key + stash_constants.y;
+  return (stash_constants.x ^ offset_key) % kStashSize;
+}
+};  // namespace cudahash
+#endif
--- a/include/hash/hash_table.cuh
+++ b/include/hash/hash_table.cuh
+// -------------------------------------------------------------
+// cuDPP -- CUDA Data Parallel Primitives library
+// -------------------------------------------------------------
+// $Revision:$
+// $Date:$
+// ------------------------------------------------------------- 
+// This source code is distributed under the terms of license.txt in
+// the root directory of this source distribution.
+// ------------------------------------------------------------- 
+/**
+ * @file hash_table.cuh
+ *
+ * @brief Implements kernel and __device__ functions for a basic hash table.
+ */
+#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH
+#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH
+#include "definitions.h"
+#include "hash_table.h"
+#include <tensorview/tensorview.h>
+#include <driver_types.h>
+namespace cudahash {
+//! Packs a key-value pair into one 64-bit Entry: the key goes into the upper
+//! 32 bits and the value into the lower 32 bits.
+TV_HOST_DEVICE_INLINE Entry make_entry(unsigned key, unsigned value) {
+  const Entry high_bits = static_cast<Entry>(key) << 32;
+  return high_bits + value;
+}
+//! Extracts the key (upper 32 bits) from an Entry.
+TV_HOST_DEVICE_INLINE unsigned get_key(Entry entry) {
+  const Entry upper = entry >> 32;
+  return static_cast<unsigned>(upper);
+}
+//! Extracts the value (lower 32 bits) from an Entry.
+TV_HOST_DEVICE_INLINE unsigned get_value(Entry entry) {
+  const Entry lower = entry & 0xffffffff;
+  return static_cast<unsigned>(lower);
+}
+//! @name Internal
+//! @brief Functions used for building the hash table.
+//! @{
+//! Writes \p value into each of the first \p table_size elements of \p table.
+/*! Each thread handles at most one element.  The flat thread index is built
+ *  from threadIdx.x, blockIdx.x and blockIdx.y; threads whose index falls
+ *  outside the table do nothing.
+ */
+template <class T> __global__
+void clear_table(const unsigned  table_size,
+                 const T         value,
+                       T        *table)
+{
+  const unsigned block_offset = blockIdx.x * blockDim.x;
+  const unsigned row_offset   = blockIdx.y * blockDim.x * gridDim.x;
+  const unsigned slot         = threadIdx.x + block_offset + row_offset;
+  if (slot >= table_size) {
+    return;
+  }
+  table[slot] = value;
+}
+//! Computes every slot of the big table in which the key may be stored.
+/*! One slot is produced per hash function: the function's value for the key,
+ *  reduced modulo \p table_size, is written to the matching element of
+ *  \p locations.
+ */
+template <unsigned kNumHashFunctions>
+__device__ void
+KeyLocations(const Functions<kNumHashFunctions> constants,
+             const unsigned  table_size,
+             const unsigned  key,
+                   unsigned  locations[kNumHashFunctions])
+{
+  #pragma unroll
+  for (unsigned which = 0; which < kNumHashFunctions; ++which) {
+    const unsigned hashed = hash_function(constants, which, key);
+    locations[which] = hashed % table_size;
+  }
+}
+//! @}
+/* --------------------------------------------------------------------------
+   Retrieval functions.
+   -------------------------------------------------------------------------- */
+//! Answers a single query.
+/*! @ingroup PublicInterface
+ *  @param[in]  query_key             Query key
+ *  @param[in]  table_size            Size of the hash table
+ *  @param[in]  table                 The contents of the hash table; the stash is read starting at table + table_size
+ *  @param[in]  constants             The hash functions used to build the table
+ *  @param[in]  stash_constants       The hash function used to build the stash
+ *  @param[in]  stash_count           The number of items in the stash; the stash is only probed when this is non-zero
+ *  @param[out] num_probes_required   Debug only: The number of probes required to resolve the query.
+ *                                    Written only when TRACK_ITERATIONS is defined and the pointer is non-NULL.
+ *  @returns The value of the query key, if the key exists in the table.  Otherwise, \ref kNotFound will be returned.
+ */
+template <unsigned kNumHashFunctions> __device__
+unsigned retrieve(const unsigned                      query_key,
+                  const unsigned                      table_size,
+                  const Entry                        *table,
+                  const Functions<kNumHashFunctions>  constants,
+                  const uint2                         stash_constants,
+                  const unsigned                      stash_count,
+                        unsigned                     *num_probes_required = NULL)
+{
+  // Identify all of the locations that the key can be located in.
+  unsigned locations[kNumHashFunctions];
+  KeyLocations(constants, table_size, query_key, locations);
+  // Check each location until the key is found.
+  unsigned num_probes = 1;
+  Entry    entry      = table[locations[0]];
+  unsigned key        = get_key(entry);
+  #pragma unroll
+  for (unsigned i = 1; i < kNumHashFunctions; ++i) {
+    // Keep probing only while the slot holds some other key.  An empty slot
+    // ends the search; the sentinel for an empty slot's key is kKeyEmpty
+    // (the previous comparison against kNotFound only worked because both
+    // constants share the value 0xffffffff).
+    if (key != query_key && key != kKeyEmpty) {
+      num_probes++;
+      entry = table[locations[i]];
+      key = get_key(entry);
+    }
+  }
+  // Check the stash.
+  if (stash_count && get_key(entry) != query_key) {
+    num_probes++;
+    const Entry *stash = table + table_size;
+    unsigned slot = stash_hash_function(stash_constants, query_key);
+    entry = stash[slot];
+  }
+#ifdef TRACK_ITERATIONS
+  if (num_probes_required) {
+    *num_probes_required = num_probes;
+  }
+#endif
+  if (get_key(entry) == query_key) {
+    return get_value(entry);
+  } else {
+    return kNotFound;
+  }
+}
+//! Looks up a batch of keys in a basic hash table, one query per thread.
+/*! The flat thread index selects the query: the key is read from \p keys_in
+ *  and the result of retrieve() is stored at the same index of \p values_out.
+ *  When \p num_probes_required is non-NULL, retrieve() is given the address of
+ *  this thread's element of that array.  Threads whose index is not below
+ *  \p n_queries do nothing.
+ */
+template <unsigned kNumHashFunctions> __global__
+void hash_retrieve(const unsigned                      n_queries,
+                   const unsigned                     *keys_in,
+                   const unsigned                      table_size,
+                   const Entry                        *table,
+                   const Functions<kNumHashFunctions>  constants,
+                   const uint2                         stash_constants,
+                   const unsigned                      stash_count,
+                         unsigned                     *values_out,
+                         unsigned                     *num_probes_required = NULL)
+{
+  const unsigned query_index = threadIdx.x +
+                               blockIdx.x * blockDim.x +
+                               blockIdx.y * blockDim.x * gridDim.x;
+  if (query_index < n_queries) {
+    unsigned *probe_slot =
+        (num_probes_required ? num_probes_required + query_index : NULL);
+    const unsigned query = keys_in[query_index];
+    values_out[query_index] = retrieve<kNumHashFunctions>(
+        query, table_size, table, constants, stash_constants, stash_count,
+        probe_slot);
+  }
+}
+/* --------------------------------------------------------------------------
+   Build a cuckoo hash table.
+   -------------------------------------------------------------------------- */
+//! @name Internal
+//! @{
+//! Picks the slot an evicted key should try next; hash functions are used in round-robin order.
+/*! All candidate slots of the key are computed first.  The result defaults to
+ *  the first candidate; if \p previous_location equals candidate i (for any i
+ *  except the last), the result becomes candidate i+1.  The scan runs from the
+ *  highest index down, so when several candidates match, the lowest matching
+ *  index decides.
+ */
+template <unsigned kNumHashFunctions> __device__
+unsigned determine_next_location(const Functions<kNumHashFunctions> constants,
+                                 const unsigned                     table_size,
+                                 const unsigned                     key,
+                                 const unsigned                     previous_location) {
+  unsigned candidates[kNumHashFunctions];
+  #pragma unroll
+  for (unsigned which = 0; which < kNumHashFunctions; ++which) {
+    const unsigned hashed = hash_function(constants, which, key);
+    candidates[which] = hashed % table_size;
+  }
+  unsigned chosen = candidates[0];
+  #pragma unroll
+  for (int i = kNumHashFunctions - 2; i >= 0; --i) {
+    if (previous_location == candidates[i]) {
+      chosen = candidates[i + 1];
+    }
+  }
+  return chosen;
+}
+//! Attempts to insert a single entry into the hash table.
+/*! This process stops after a certain number of iterations.  If the thread is
+    still holding onto an item because of an eviction, it tries the stash.
+    If it fails to enter the stash, it returns false.
+    Otherwise, it succeeds and returns true.
+
+    \param[in]     table_size              Number of slots in the main table; the stash starts at table + table_size.
+    \param[in]     constants               Hash functions used to pick slots in the main table.
+    \param[in]     stash_constants         Hash function constants used to pick the stash slot.
+    \param[in]     max_iteration_attempts  Maximum number of exchanges attempted in the main table.
+    \param[in,out] table                   Main table followed by the stash; modified with atomic operations.
+    \param[in,out] stash_count             Incremented atomically when the entry is placed in the stash.
+    \param[in]     entry                   Entry to insert.
+    \param[out]    iterations_used         Set to the iteration number at which an empty slot was found.
+                                           Not written when the loop ends without finding one, so the
+                                           caller's initial value is kept in that case.
+ */
+template <unsigned kNumHashFunctions> __device__
+bool insert(const unsigned                      table_size,
+            const Functions<kNumHashFunctions>  constants,
+            const uint2                         stash_constants,
+            const unsigned                      max_iteration_attempts,
+                  Entry                        *table,
+                  unsigned                     *stash_count,
+                  Entry                         entry,
+                  unsigned                     *iterations_used) {
+  unsigned key = get_key(entry);
+  // The key is always inserted into its first slot at the start.
+  unsigned location = hash_function(constants, 0, key) % table_size;
+  // Keep inserting until an empty slot is found or the eviction chain grows too large.
+  for (unsigned its = 1; its <= max_iteration_attempts; its++) {
+    // Insert the new entry; atomicExch hands back whatever occupied the slot, which this thread now has to place.
+    entry = atomicExch(&table[location], entry);
+    key   = get_key(entry);
+    // If no key was evicted, we're done.
+    if (key == kKeyEmpty) {
+      *iterations_used = its;
+      break;
+    }
+    // Otherwise, determine where the evicted key will go.
+    location = determine_next_location(constants, table_size, key, location);
+  }
+  if (key != kKeyEmpty) {
+    // Shove it into the stash.  The compare-and-swap only stores the entry if the stash slot still holds kEntryEmpty; an occupied slot means failure.
+    unsigned slot = stash_hash_function(stash_constants, key);
+    Entry *stash = table + table_size;
+    Entry replaced_entry = atomicCAS(stash + slot, kEntryEmpty, entry);
+    if (replaced_entry != kEntryEmpty) {
+      return false;
+    } else {
+      atomicAdd(stash_count, 1);
+    }
+  }
+  return true;
+}
+//! Build a basic hash table, using one big table.  Each thread inserts one key-value pair.
+/*! @param[in]     n_entries               Number of key-value pairs; threads with a larger index do nothing.
+ *  @param[in]     keys                    Input keys, read at the thread's flat index.
+ *  @param[in]     values                  Input values, read at the thread's flat index.
+ *  @param[in]     table_size              Number of slots in the main table.
+ *  @param[in]     constants               Hash functions used to pick slots in the main table.
+ *  @param[in]     max_iteration_attempts  Eviction-chain limit handed to insert().
+ *  @param[in,out] table                   Main table followed by the stash.
+ *  @param[in]     stash_constants         Hash function constants for the stash.
+ *  @param[in,out] stash_count             Counter handed to insert(), which bumps it on stash insertions.
+ *  @param[in,out] failures                Failure flag.  A thread returns without inserting if it reads a
+ *                                         non-zero value here on entry; the read is a plain load, so this
+ *                                         is a best-effort early exit.  On a failed insertion it is set to 1,
+ *                                         or incremented atomically when COUNT_UNINSERTED is defined.
+ *  @param[out]    iterations_taken        Optional.  When TRACK_ITERATIONS is defined and this pointer is
+ *                                         non-null, receives the thread's iteration count at its flat index.
+ */
+template <unsigned kNumHashFunctions> __global__
+void CuckooHash(const unsigned   n_entries,
+                const unsigned  *keys,
+                const unsigned  *values,
+                const unsigned   table_size,
+                const Functions<kNumHashFunctions>  constants,
+                const unsigned   max_iteration_attempts,
+                      Entry     *table,
+                      uint2      stash_constants,
+                      unsigned  *stash_count,
+                      unsigned  *failures,
+                      unsigned  *iterations_taken = nullptr) {
+  // Check if this thread has an item and if any previous threads failed.
+  unsigned thread_index = threadIdx.x +
+                          blockIdx.x * blockDim.x +
+                          blockIdx.y * blockDim.x * gridDim.x;
+  if (thread_index >= n_entries || *failures)
+    return;
+  Entry entry = make_entry(keys[thread_index], values[thread_index]);
+  unsigned iterations = 0;
+  bool success = insert<kNumHashFunctions>
+                       (table_size, constants, stash_constants,
+                       max_iteration_attempts, table, stash_count, entry, &iterations);
+  if (success == false) {
+    // The eviction chain grew too large.  Report failure.
+  #ifdef COUNT_UNINSERTED
+    atomicAdd(failures, 1);
+  #else
+    *failures = 1;
+  #endif
+  }
+#ifdef TRACK_ITERATIONS
+  // The parameter defaults to nullptr, so only record the count when the
+  // caller actually supplied a buffer.
+  if (iterations_taken) {
+    iterations_taken[thread_index] = iterations;
+  }
+#endif
+}
+//! @}
+};  // namespace cudahash
+#endif
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
--- a/include/hash/hash_table.h
+++ b/include/hash/hash_table.h
+// -------------------------------------------------------------
+// cuDPP -- CUDA Data Parallel Primitives library
+// -------------------------------------------------------------
+// $Revision:$
+// $Date:$
+// ------------------------------------------------------------- 
+// This source code is distributed under the terms of license.txt in
+// the root directory of this source distribution.
+// ------------------------------------------------------------- 
+/**
+ * @file hash_table.h
+ *
+ * @brief Header for a basic hash table that stores one value per key.
+ */
+#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
+#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
+#include "definitions.h"
+#include "hash_functions.h"
+#include <cstdio>
+/** \addtogroup cudpp_app 
+  * @{
+  */
+/** \addtogroup cudpp_hash_data_structures
+ * @{
+ */
+/* --------------------------------------------------------------------------
+   Doxygen definitions.
+   -------------------------------------------------------------------------- */
+/*! @namespace cudahash
+ *  @brief Encapsulates the hash table library: a cuckoo hash table that uses
+ *  a stash.
+ */
+/* -------------------------------------------------------------------------
+   Hash table code.
+   ------------------------------------------------------------------------- */
+namespace cudahash {
+/*! Compute how many thread blocks are required for the given number of threads.
+ *  NOTE(review): the kernels in hash_table.cuh flatten blockIdx.x and
+ *  blockIdx.y into a single thread index, so this presumably returns a
+ *  two-dimensional grid when one axis is not enough -- confirm against the
+ *  implementation.
+ */
+dim3 ComputeGridDim(unsigned threads);
+//! Compute how long an eviction chain is allowed to become for a given input size.
+/*! \param[in] num_keys       Number of keys in the input.
+ *  \param[in] table_size     Number of slots in the hash table.
+ *  \param[in] num_functions  Number of hash functions being used.
+ *  \returns The number of iterations that should be allowed.
+ *
+ *  The latter two parameters are only needed when using an empirical
+ *  formula for computing the chain length.
+ *
+ *  NOTE(review): the result is presumably what the build kernel receives as
+ *  max_iteration_attempts -- confirm against the caller.
+ */
+unsigned ComputeMaxIterations(const unsigned num_keys,
+                              const unsigned table_size,
+                              const unsigned num_functions);
+//! Basic hash table that stores one value for each key.
+/*! The input consists of two unsigned arrays of keys and values.
+ *  None of the keys are expected to be repeated.
+ *
+ *  @todo Templatize the interface without forcing the header file to
+ *  have CUDA calls.
+ *  @ingroup cudpp_app
+ */
+class HashTable {
+ public:
+  HashTable();
+  virtual ~HashTable() {Release();}
+  //! Initialize the hash table's memory. Must be called before \ref
+  //! Build() and after the random number generator has been seeded.
+  /*! @param[in] max_input_size   Largest expected number of items in the input.
+   *  @param[in] space_usage Size of the hash table relative to the
+   *                         input. Bigger tables are faster to build
+   *                         and retrieve from.
+   *  @param[in] num_functions Number of hash functions to use. May be
+   *                           2-5. More hash functions make it easier
+   *                           to build the table, but increase
+   *                           retrieval times.
+   *  @returns Whether the hash table was initialized successfully (true) 
+   *           or not (false).
+   *
+   *  The minimum space usage is dependent on the number of functions
+   *  being used; for two through five functions, the minimum space
+   *  usage is 2.1, 1.1, 1.03, and 1.02 respectively.
+   */
+  virtual bool Initialize(const unsigned max_input_size,
+                          const float    space_usage    = 1.25,
+                          const unsigned num_functions  = 4);
+  //! Free all memory.
+  virtual void Release();
+  //! Build the hash table.
+  /*! @param[in] input_size   Number of key-value pairs being inserted.
+   *  @param[in] d_keys       Device memory array containing all of the input 
+   *                          keys.
+   *  @param[in] d_vals       Device memory array containing the keys' values.
+   *  @returns Whether the hash table was built successfully (true) or 
+   *           not (false).
+   *
+   *  Several attempts are allowed to build the hash table in case of failure.
+   *  The input keys are expected to be completely unique.
+   *  To reduce the chance of a failure, increase the space usage or number of 
+   *  functions.
+   *  Keys are not allowed to be equal to cudahash::kKeyEmpty.
+   */
+  virtual bool Build(const unsigned  input_size,
+                     const unsigned *d_keys,
+                     const unsigned *d_vals);
+  //! Query the hash table.
+  /*! @param[in] n_queries        Number of keys in the query set.
+   *  @param[in] d_query_keys     Device memory array containing all of
+   *                              the query keys.
+   *  @param[in] d_query_results  Values for the query keys.
+   *
+   *  kNotFound is returned for any query key that failed to be found
+   *  in the table.
+   */
+  virtual void Retrieve(const unsigned  n_queries,
+                        const unsigned *d_query_keys,
+                              unsigned *d_query_results);
+  //! @name Accessors
+  /// @brief Mainly needed to use the __device__ CudaHT::retrieve()
+  /// function directly.
+  /// @{
+  //! Returns how many slots the hash table has.
+  inline unsigned     get_table_size()         const {return table_size_;}
+  //! Returns how many items are stored in the stash.
+  inline unsigned     get_stash_count()        const {return stash_count_;}
+  //! Returns the constants used by the stash.
+  inline uint2        get_stash_constants()    const {return stash_constants_;}
+  //! Returns the hash table contents.
+  inline const Entry* get_contents()           const {return d_contents_;}
+  //! Returns the number of hash functions being used.
+  inline unsigned     get_num_hash_functions() const {return 
+                                                      num_hash_functions_;}
+  //! When using two hash functions, returns the constants.
+  inline Functions<2> get_constants_2()        const {return constants_2_;}
+  //! When using three hash functions, returns the constants.
+  inline Functions<3> get_constants_3()        const {return constants_3_;}
+  //! When using four hash functions, returns the constants.
+  inline Functions<4> get_constants_4()        const {return constants_4_;}
+  //! When using five hash functions, returns the constants.
+  inline Functions<5> get_constants_5()        const {return constants_5_;}
+  /// @}
+  //! Returns a mutable pointer to the hash table contents (d_contents_).
+  inline Entry * data(){return d_contents_;}
+  //! Returns a read-only pointer to the hash table contents; same pointer
+  //! as get_contents().
+  inline const Entry * data() const {return d_contents_;}
+ protected:
+  unsigned      table_size_;           //!< Size of the hash table.
+  unsigned      num_hash_functions_;   //!< Number of hash functions being used.
+  Entry        *d_contents_;           //!< Device memory: The hash table contents.  The stash is stored at the end.
+  unsigned      stash_count_;          //!< Number of key-value pairs currently stored in the stash.
+  uint2         stash_constants_;      //!< Hash function constants for the stash.
+  Functions<2>  constants_2_;          //!< Constants for a set of two hash functions.
+  Functions<3>  constants_3_;          //!< Constants for a set of three hash functions.
+  Functions<4>  constants_4_;          //!< Constants for a set of four hash functions.
+  Functions<5>  constants_5_;          //!< Constants for a set of five hash functions.
+  unsigned     *d_failures_;           //!< Device memory: General use error flag.
+};
+/*! @name Internal
+ *  @{
+ */
+// Host-callable entry points for the hash-table routines. Only the
+// declarations are visible here; the definitions live elsewhere.
+// NOTE(review): the d_-prefixed pointers are presumably device pointers,
+// matching the naming of the table class members -- confirm at the
+// definitions.
+namespace CUDAWrapper {
+//! Fills a 64-bit array with a particular value.
+/*! Takes the number of slots, the fill value and the array to fill. */
+void ClearTable(const unsigned  slots_in_table,
+                const Entry     fill_value,
+                      Entry    *d_array);
+//! Calls the Cuckoo Hash construction kernel.
+/*! Constant sets for two to five hash functions are all passed in.
+ *  NOTE(review): presumably only the set matching num_hash_functions is
+ *  used -- confirm at the definition.
+ *  d_contents, d_stash_count, d_failures and d_iterations_taken are
+ *  non-const pointers, i.e. outputs or in/out state of the build.
+ */
+void CallCuckooHash(const unsigned      n_entries,
+                    const unsigned      num_hash_functions,
+                    const unsigned     *d_keys,
+                    const unsigned     *d_values,
+                    const unsigned      table_size,
+                    const Functions<2>  constants_2,
+                    const Functions<3>  constants_3,
+                    const Functions<4>  constants_4,
+                    const Functions<5>  constants_5,
+                    const unsigned      max_iteration_attempts,
+                          Entry        *d_contents,
+                          uint2         stash_constants,
+                          unsigned     *d_stash_count,
+                          unsigned     *d_failures,
+                          unsigned     *d_iterations_taken);
+//! Calls the kernel that performs retrievals.
+/*! The table and its constants are read-only inputs; values_out is the
+ *  only non-const pointer and receives the results for keys_in.
+ */
+void CallHashRetrieve(const unsigned      n_queries,
+                      const unsigned      num_hash_functions,
+                      const unsigned     *keys_in,
+                      const unsigned      table_size,
+                      const Entry        *table,
+                      const Functions<2>  constants_2,
+                      const Functions<3>  constants_3,
+                      const Functions<4>  constants_4,
+                      const Functions<5>  constants_5,
+                      const uint2         stash_constants,
+                      const unsigned      stash_count,
+                            unsigned     *values_out);
+};
+/// @}
+};  // namespace cudahash
+/** @} */ // end hash table data structures
+/** @} */ // end cudpp_app
+#endif
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
--- a/include/hash/mt19937ar.h
+++ b/include/hash/mt19937ar.h
+// Prototypes for a Mersenne Twister (MT19937) generator.
+// NOTE(review): the names match the reference mt19937ar.c by Matsumoto and
+// Nishimura; the interval notes below are taken from that reference and
+// assume the implementation is the unmodified reference code -- TODO confirm.
+// Seeds the generator from a single value.
+void init_genrand(unsigned long s);
+// Seeds the generator from an array of key_length values.
+void init_by_array(unsigned long init_key[], int key_length);
+// Presumably a random integer in [0, 0xffffffff].
+unsigned long genrand_int32(void);
+// Presumably a random integer in [0, 0x7fffffff].
+long genrand_int31(void);
+// Presumably a random real in the closed interval [0, 1].
+double genrand_real1(void);
+// Presumably a random real in the half-open interval [0, 1).
+double genrand_real2(void);
+// Presumably a random real in the open interval (0, 1).
+double genrand_real3(void);
+// Presumably a random real in [0, 1) with 53-bit resolution.
+double genrand_res53(void);
--- a/include/spconv/geometry.h
+++ b/include/spconv/geometry.h
@@ -271,13 +271,14 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
  Index* validPoints = validPoints_.data();
  Index *pointPtr = nullptr;
-  Index index = 0;
  for (int j = 0; j < numActIn; ++j) {
+    Index index = 0;
    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
                                         outSpatialShape) +
            spatialVolume * indicesIn(j, 0);
    gridsOut[index] = j;
  }
+  Index index = 0;
  for (int j = 0; j < numActIn; ++j) {
    numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,

--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
@@ -14,9 +14,10 @@
 #ifndef INDICE_CU_H_
 #define INDICE_CU_H_
-#include <tensorview/tensorview.h>
+#include <hash/hash_table.cuh>
-#include <tensorview/helper_kernel.cu.h>
 #include <spconv/geometry.h>
+#include <tensorview/helper_kernel.cu.h>
+#include <tensorview/tensorview.h>
 namespace spconv {
 template <typename Index, typename IndexGrid, unsigned NDim,
@@ -127,6 +128,49 @@ __global__ void assignGridAndIndiceOutKernel(
  }
 }
+// Fills the rows of indicesOut from the flat indices in indicePairUnique.
+//
+// indicesOut is addressed as rows of (NDim + 1) entries. For every ix that
+// tv::KernelLoopX hands out for numAct:
+//   * tv::rowArrayIdxInv is given indicePairUnique[ix], a pointer to
+//     entries 1..NDim of row ix, and outSpatialShape;
+//   * its return value modulo batchSize is stored in entry 0 of row ix.
+// NOTE(review): presumably rowArrayIdxInv writes the NDim spatial
+// coordinates and returns the remaining batch part -- confirm against its
+// definition.
+// kNumHashFunctions is not referenced by the body.
+template <typename Index, unsigned NDim, unsigned kNumHashFunctions = 4>
+__global__ void assignIndiceOutKernel(
+    tv::TensorView<Index> indicesOut, int numAct,
+    tv::TensorView<Index> indicePairUnique,
+    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
+  auto outBase = indicesOut.data();
+  for (unsigned row : tv::KernelLoopX<unsigned>(numAct)) {
+    // Offset of the first entry of this row inside indicesOut.
+    const unsigned rowStart = row * (NDim + 1);
+    const Index flat = indicePairUnique[row];
+    const Index rest = tv::rowArrayIdxInv<Index, NDim>(
+        flat, outBase + rowStart + 1, outSpatialShape.data());
+    indicesOut[rowStart] = rest % batchSize;
+  }
+}
+// Rewrites indicePairs(:, 1, :) in place through a hash-table lookup.
+//
+// For every ix handed out by tv::KernelLoopX for numActIn and every i in
+// [0, indicePairs.dim(0)): an entry greater than -1 is used as the key for
+// cudahash::retrieve and replaced by the value that lookup returns. Other
+// entries are left untouched. A lookup returning cudahash::kNotFound trips
+// the device-side assert.
+// indicesOut, indicePairUnique, IndexGrid and NDim are not referenced by the
+// body.
+template <typename Index, typename IndexGrid, unsigned NDim,
+          unsigned kNumHashFunctions = 4>
+__global__ void
+assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
+                            tv::TensorView<Index> indicePairs,
+                            tv::TensorView<Index> indicePairUnique,
+                            unsigned table_size, const cudahash::Entry *table,
+                            cudahash::Functions<kNumHashFunctions> constants,
+                            uint2 stash_constants, unsigned stash_count) {
+  const int numOffsets = indicePairs.dim(0);
+  for (int col : tv::KernelLoopX<int>(numActIn)) {
+    for (int k = 0; k < numOffsets; ++k) {
+      const Index key = indicePairs(k, 1, col);
+      if (!(key > -1)) {
+        continue;  // nothing stored in this slot
+      }
+      auto found = cudahash::retrieve(static_cast<unsigned>(key), table_size,
+                                      table, constants, stash_constants,
+                                      stash_count);
+      assert(found != cudahash::kNotFound);
+      indicePairs(k, 1, col) = static_cast<unsigned>(found);
+    }
+  }
+}
 template <typename Index, typename IndexGrid, unsigned NDim>
 __global__ void
 assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
@@ -150,8 +194,8 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
 template <typename Index, typename IndexGrid, unsigned NDim>
 __global__ void
 prepareSubMGridKernel(tv::TensorView<const Index> indicesIn,
-                  tv::TensorView<IndexGrid> gridsOut,
+                      tv::TensorView<IndexGrid> gridsOut,
-                  const tv::SimpleVector<Index, NDim> outSpatialShape) {
+                      const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
 #pragma unroll
@@ -167,6 +211,29 @@ prepareSubMGridKernel(tv::TensorView<const Index> indicesIn,
  }
 }
+// Produces one (key, value) pair per row of indicesIn.
+//
+// Rows of indicesIn have (NDim + 1) entries. For every ix handed out by
+// tv::KernelLoopX for indicesIn.dim(0):
+//   keys[ix]   = tv::rowArrayIdx(entries 1..NDim of row ix, outSpatialShape)
+//                + (product of outSpatialShape) * indicesIn(ix, 0)
+//   values[ix] = ix
+// The key is computed as Index and converted to unsigned on the store.
+template <typename Index, unsigned NDim>
+__global__ void
+prepareSubMHashKernel(tv::TensorView<const Index> indicesIn,
+                      unsigned* keys,
+                      unsigned* values,
+                      const tv::SimpleVector<Index, NDim> outSpatialShape) {
+  const auto numRows = indicesIn.dim(0);
+  // Number of cells covered by outSpatialShape.
+  Index volume = 1;
+#pragma unroll
+  for (int d = 0; d < NDim; ++d) {
+    volume *= outSpatialShape[d];
+  }
+  for (int row : tv::KernelLoopX<int>(numRows)) {
+    auto coords = indicesIn.data() + row * (NDim + 1) + 1;
+    const Index key =
+        tv::rowArrayIdx<Index, NDim>(coords, outSpatialShape.data()) +
+        volume * indicesIn(row, 0);
+    keys[row] = key;
+    values[row] = row;
+  }
+}
 template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
 __global__ void getSubMIndicePairsKernel(
@@ -206,6 +273,52 @@ __global__ void getSubMIndicePairsKernel(
  }
 }
+// Builds indicePairs / indiceNum by looking candidate positions up in a
+// hash table.
+//
+// For every ix handed out by tv::KernelLoopX for indicesIn.dim(0):
+//   1. getValidOutPos fills a function-local scratch array with candidates;
+//      each candidate occupies (NDim + 1) entries, and the entry at index
+//      NDim is read as the offset.
+//   2. Each candidate's key is tv::rowArrayIdx(candidate, outSpatialShape)
+//      + (product of outSpatialShape) * indicesIn(ix, 0), passed to
+//      cudahash::retrieve as unsigned.
+//   3. Unless the lookup returns cudahash::kNotFound, a slot is reserved
+//      with atomicAdd on indiceNum[offset]; the looked-up value is written
+//      to indicePairs(offset, 1, slot) and ix to indicePairs(offset, 0, slot).
+// The scratch array holds KernelMaxVolume candidates.
+template <typename Index, unsigned NDim,
+          int KernelMaxVolume = 256, unsigned kNumHashFunctions=4>
+__global__ void getSubMIndicePairsHashKernel(
+    tv::TensorView<const Index> indicesIn,
+    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
+    const tv::SimpleVector<Index, NDim> kernelSize,
+    const tv::SimpleVector<Index, NDim> stride,
+    const tv::SimpleVector<Index, NDim> padding,
+    const tv::SimpleVector<Index, NDim> dilation,
+    const tv::SimpleVector<Index, NDim> outSpatialShape,
+    unsigned table_size, const cudahash::Entry *table,
+    cudahash::Functions<kNumHashFunctions> constants,
+    uint2 stash_constants, unsigned stash_count) {
+  const auto numRows = indicesIn.dim(0);
+  // Number of cells covered by outSpatialShape.
+  Index volume = 1;
+#pragma unroll
+  for (int d = 0; d < NDim; ++d) {
+    volume *= outSpatialShape[d];
+  }
+  // Scratch space for the candidates of a single input row.
+  Index candidates[KernelMaxVolume * (NDim + 1)];
+  for (int row : tv::KernelLoopX<int>(numRows)) {
+    const Index numCandidates = getValidOutPos<Index, NDim>(
+        indicesIn.data() + row * (NDim + 1) + 1, kernelSize.data(),
+        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
+        candidates);
+    for (int c = 0; c < numCandidates; ++c) {
+      Index *candidate = candidates + c * (NDim + 1);
+      auto offset = candidate[NDim];
+      const Index key =
+          tv::rowArrayIdx<Index, NDim>(candidate, outSpatialShape.data()) +
+          volume * indicesIn(row, 0);
+      auto hit = cudahash::retrieve((unsigned)(key), table_size, table,
+                                    constants, stash_constants, stash_count);
+      if (hit == cudahash::kNotFound) {
+        continue;  // key is not in the table
+      }
+      // Reserve the next free slot for this offset.
+      auto slot = atomicAdd(indiceNum.data() + offset, Index(1));
+      indicePairs(offset, 1, slot) = hit;
+      indicePairs(offset, 0, slot) = row;
+    }
+  }
+}
 template <typename Index, typename IndexGrid, unsigned NDim>
 __global__ void resetGridKernel(const Index *indicePairUnique,
                                tv::TensorView<IndexGrid> gridsOut,
@@ -215,6 +328,14 @@ __global__ void resetGridKernel(const Index *indicePairUnique,
  }
 }
+// Writes each position into data: data[i] = i for every i that
+// tv::KernelLoopX hands out for size. The int position is converted to T
+// on the store.
+template <typename T>
+__global__ void arangeKernel(T *data, int size) {
+  for (int pos : tv::KernelLoopX<int>(size)) {
+    *(data + pos) = pos;
+  }
+}
 template <typename Index, typename IndexGrid, unsigned NDim>
 __global__ void
 resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,

--- a/include/spconv/indice.h
+++ b/include/spconv/indice.h
@@ -44,7 +44,7 @@ struct CreateConvIndicePairFunctorP2
        tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
        tv::TensorView<Index> indicePairUnique,
        const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose,
-        bool resetGrid=false);
+        bool resetGrid=false, bool useHash=true);
 };
 template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
@@ -58,7 +58,8 @@ struct CreateConvIndicePairFunctor
        const tv::SimpleVector<Index, NDim> stride,
        const tv::SimpleVector<Index, NDim> padding,
        const tv::SimpleVector<Index, NDim> dilation,
-        const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false);
+        const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false,
+        bool useHash=true);
 };
 template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
@@ -71,7 +72,8 @@ struct CreateSubMIndicePairFunctor
        const tv::SimpleVector<Index, NDim> stride,
        const tv::SimpleVector<Index, NDim> padding,
        const tv::SimpleVector<Index, NDim> dilation,
-        const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false);
+        const tv::SimpleVector<Index, NDim> outSpatialShape, bool transpose, bool resetGrid=false,
+        bool useHash=true);
 };
 } // namespace functor
 } // namespace spconv

--- a/include/spconv/pillar_scatter_functor.h
+++ b/include/spconv/pillar_scatter_functor.h
+// Copyright 2019 Yan Yan
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+//     http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef POINTPILLARS_SCATTER_FUNCTOR_H_
+#define POINTPILLARS_SCATTER_FUNCTOR_H_
+#include <tensorview/tensorview.h>
+namespace spconv
+{
+namespace functor
+{
+// Functor interface parameterised on the device tag, the element type T and
+// an index type. Only operator() is declared here; its definition is not
+// part of this header.
+// NOTE(review): presumably writes `features` into `canvas` at the positions
+// given by `coors` -- confirm against the device specialisation.
+template <typename Device, typename T, typename Index>
+struct PointPillarScatter
+{
+    // canvas is the only mutable view; features and coors are read-only and
+    // share the element type T.
+    void operator()(const Device& d, tv::TensorView<T> canvas,
+                  tv::TensorView<const T> features,
+                  tv::TensorView<const T> coors);
+};
+} // namespace functor
+} // namespace spconv
+#endif
\ No newline at end of file
--- a/include/spconv/pillar_scatter_ops.h
+++ b/include/spconv/pillar_scatter_ops.h
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef PILLAR_SCATTER_OP_H_
+#define PILLAR_SCATTER_OP_H_
+#include <cuda_runtime_api.h>
+#include <spconv/pillar_scatter_functor.h>
+#include <torch/script.h>
+#include <torch_utils.h>
+#include <utility/timer.h>
+namespace spconv {
+// torch.jit's doc says only support int64, so we need to convert to int32.
+// Allocates a zero-filled 4-D canvas and runs the GPU PointPillarScatter
+// functor on it.
+//
+// features: at least 3-D, first dimension 1, options() reused for the canvas.
+// coors:    first dimension 1, dtype T, size(2) equal to features.size(2).
+// shape:    1-D CPU int tensor with 4 entries giving the canvas dimensions;
+//           entry 1 must equal features.size(1).
+// Both features and coors are passed to the functor after squeeze(), which
+// drops every dimension of size 1.
+// Returns the canvas. Any violated precondition goes through
+// TV_ASSERT_RT_ERR.
+template <typename T>
+torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
+                                 torch::Tensor shape) {
+  // Preconditions on the shape descriptor.
+  TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error");
+  TV_ASSERT_RT_ERR(shape.dim() == 1, "error");
+  TV_ASSERT_RT_ERR(shape.size(0) == 4, "error");
+  // Preconditions on the inputs.
+  TV_ASSERT_RT_ERR(features.dim() >= 3, "error");
+  TV_ASSERT_RT_ERR(features.size(0) == 1, "feature first dim must be 1");
+  TV_ASSERT_RT_ERR(coors.size(0) == 1, "coors first dim must be 1");
+  TV_ASSERT_RT_ERR(features.size(2) == coors.size(2), "err");
+  tv::check_torch_dtype<int>(shape);
+  tv::check_torch_dtype<T>(coors);
+
+  auto shapeData = shape.data<int>();
+  torch::Tensor canvas =
+      torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
+                   features.options());
+  TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");
+
+  // Keep the squeezed tensors alive for the duration of the functor call.
+  torch::Tensor squeezedFeatures = features.squeeze();
+  torch::Tensor squeezedCoors = coors.squeeze();
+  functor::PointPillarScatter<tv::GPU, T, int> scatter;
+  scatter(tv::TorchGPU(), tv::torch2tv<T>(canvas),
+          tv::torch2tv<const T>(squeezedFeatures),
+          tv::torch2tv<const T>(squeezedCoors));
+  return canvas;
+}
+} // namespace spconv
+#endif
\ No newline at end of file
--- a/include/spconv/point2voxel.h
+++ b/include/spconv/point2voxel.h
@@ -29,7 +29,7 @@ using namespace pybind11::literals;
 template <typename DType, int NDim>
 int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
-                          py::array_t<int> coors,
+                          py::array_t<DType> voxel_point_mask, py::array_t<int> coors,
                          py::array_t<int> num_points_per_voxel,
                          py::array_t<int> coor_to_voxelidx,
                          std::vector<DType> voxel_size,
@@ -37,6 +37,7 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
                          int max_voxels) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
+  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
  auto coors_rw = coors.mutable_unchecked<2>();
  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
@@ -79,6 +80,7 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
    }
    num = num_points_per_voxel_rw(voxelidx);
    if (num < max_points) {
+      voxel_point_mask_rw(voxelidx, num) = DType(1);
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(voxelidx, num, k) = points_rw(i, k);
      }
@@ -87,23 +89,23 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
  }
  for (int i = 0; i < voxel_num; ++i) {
    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
  }
  return voxel_num;
 }
 template <typename DType, int NDim>
-int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> voxels,
+int points_to_voxel_3d_np_mean(py::array_t<DType> points,
-                          py::array_t<DType> means,
+                               py::array_t<DType> voxel_point_mask, py::array_t<DType> voxels,
-                          py::array_t<int> coors,
+                               py::array_t<DType> means, py::array_t<int> coors,
-                          py::array_t<int> num_points_per_voxel,
+                               py::array_t<int> num_points_per_voxel,
-                          py::array_t<int> coor_to_voxelidx,
+                               py::array_t<int> coor_to_voxelidx,
-                          std::vector<DType> voxel_size,
+                               std::vector<DType> voxel_size,
-                          std::vector<DType> coors_range, int max_points,
+                               std::vector<DType> coors_range, int max_points,
-                          int max_voxels) {
+                               int max_voxels) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto means_rw = means.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
+  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
  auto coors_rw = coors.mutable_unchecked<2>();
  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
@@ -146,19 +148,21 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
    }
    num = num_points_per_voxel_rw(voxelidx);
    if (num < max_points) {
+      voxel_point_mask_rw(voxelidx, num) = DType(1);
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(voxelidx, num, k) = points_rw(i, k);
      }
      num_points_per_voxel_rw(voxelidx) += 1;
      for (int k = 0; k < num_features; ++k) {
-        means_rw(voxelidx, k) += (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
+        means_rw(voxelidx, k) +=
+            (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
      }
    }
  }
  for (int i = 0; i < voxel_num; ++i) {
    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
    num = num_points_per_voxel_rw(i);
-    for (int j = num; j < max_points; ++j){
+    for (int j = num; j < max_points; ++j) {
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(i, j, k) = means_rw(i, k);
      }
@@ -168,165 +172,19 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
 }
 template <typename DType, int NDim>
-int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> voxels,
+int points_to_voxel_3d_with_filtering(
-                          py::array_t<DType> height,
+    py::array_t<DType> points, py::array_t<DType> voxels,
-                          py::array_t<DType> maxs,
+    py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask, py::array_t<DType> mins,
-                          py::array_t<int> coors,
+    py::array_t<DType> maxs, py::array_t<int> coors,
-                          py::array_t<int> num_points_per_voxel,
+    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
-                          py::array_t<int> coor_to_voxelidx,
+    std::vector<DType> voxel_size, std::vector<DType> coors_range,
-                          std::vector<DType> voxel_size,
+    int max_points, int max_voxels, int block_factor, int block_size,
-                          std::vector<DType> coors_range, int max_points,
+    DType height_threshold, DType height_high_threshold) {
-                          int max_voxels) {
-  auto points_rw = points.template mutable_unchecked<2>();
-  auto height_rw = height.template mutable_unchecked<2>();
-  auto maxs_rw = maxs.template mutable_unchecked<2>();
-  auto voxels_rw = voxels.template mutable_unchecked<3>();
-  auto coors_rw = coors.mutable_unchecked<2>();
-  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
-  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
-  auto N = points_rw.shape(0);
-  auto num_features = points_rw.shape(1);
-  // auto ndim = points_rw.shape(1) - 1;
-  constexpr int ndim_minus_1 = NDim - 1;
-  int voxel_num = 0;
-  bool failed = false;
-  int coor[NDim];
-  int c;
-  int grid_size[NDim];
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-  int voxelidx, num;
-  for (int i = 0; i < N; ++i) {
-    failed = false;
-    for (int j = 0; j < NDim; ++j) {
-      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
-      if ((c < 0 || c >= grid_size[j])) {
-        failed = true;
-        break;
-      }
-      coor[ndim_minus_1 - j] = c;
-    }
-    if (failed)
-      continue;
-    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if (voxelidx == -1) {
-      voxelidx = voxel_num;
-      if (voxel_num >= max_voxels)
-        break;
-      voxel_num += 1;
-      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
-      for (int k = 0; k < NDim; ++k) {
-        coors_rw(voxelidx, k) = coor[k];
-      }
-    }
-    num = num_points_per_voxel_rw(voxelidx);
-    if (num < max_points) {
-      for (int k = 0; k < num_features; ++k) {
-        voxels_rw(voxelidx, num, k) = points_rw(i, k);
-        height_rw(voxelidx, k) = std::min(points_rw(i, k), height_rw(voxelidx, k));
-        maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
-      }
-      num_points_per_voxel_rw(voxelidx) += 1;
-    }
-  }
-  for (int i = 0; i < voxel_num; ++i) {
-    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
-    for (int k = 0; k < num_features; ++k) {
-      height_rw(i, k) = maxs_rw(i, k) - height_rw(i, k);
-    }
-  }
-  return voxel_num;
-}
-template <typename DType, int NDim>
-int block_filtering(py::array_t<DType> points, 
-                          py::array_t<int> mask,
-                          py::array_t<DType> height,
-                          py::array_t<DType> maxs,
-                          py::array_t<int> coor_to_voxelidx,
-                          std::vector<DType> voxel_size,
-                          std::vector<DType> coors_range,
-                          int max_voxels,
-                          DType eps) {
-  auto points_rw = points.template mutable_unchecked<2>();
-  auto height_rw = height.template mutable_unchecked<1>();
-  auto maxs_rw = maxs.template mutable_unchecked<1>();
-  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
-  auto N = points_rw.shape(0);
-  auto num_features = points_rw.shape(1);
-  // auto ndim = points_rw.shape(1) - 1;
-  constexpr int ndim_minus_1 = NDim - 1;
-  int voxel_num = 0;
-  bool failed = false;
-  int coor[NDim];
-  int c;
-  int grid_size[NDim];
-  for (int i = 0; i < NDim; ++i) {
-    grid_size[i] =
-        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
-  }
-  int voxelidx, num;
-  for (int i = 0; i < N; ++i) {
-    failed = false;
-    for (int j = 0; j < NDim; ++j) {
-      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
-      if ((c < 0 || c >= grid_size[j])) {
-        failed = true;
-        break;
-      }
-      coor[ndim_minus_1 - j] = c;
-    }
-    if (failed)
-      continue;
-    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if (voxelidx == -1) {
-      voxelidx = voxel_num;
-      voxel_num += 1;
-      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
-    }
-    height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
-    maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
-  }
-  for (int i = 0; i < N; ++i) {
-    failed = false;
-    for (int j = 0; j < NDim; ++j) {
-      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
-      if ((c < 0 || c >= grid_size[j])) {
-        failed = true;
-        break;
-      }
-      coor[ndim_minus_1 - j] = c;
-    }
-    if (failed)
-      continue;
-    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
-    if ((maxs_rw(voxelidx) - height_rw(voxelidx, 2)) < eps){
-      mask(i) = 0;
-    }
-  }
-}
-template <typename DType, int NDim>
-int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DType> voxels,
-                          py::array_t<int> voxel_mask,
-                          py::array_t<DType> mins,
-                          py::array_t<DType> maxs,
-                          py::array_t<int> coors,
-                          py::array_t<int> num_points_per_voxel,
-                          py::array_t<int> coor_to_voxelidx,
-                          std::vector<DType> voxel_size,
-                          std::vector<DType> coors_range, 
-                          int max_points,
-                          int max_voxels,
-                          int block_factor,
-                          int block_size,
-                          DType height_threshold) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto mins_rw = mins.template mutable_unchecked<2>();
  auto maxs_rw = maxs.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
+  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
  auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
  auto coors_rw = coors.mutable_unchecked<2>();
  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
@@ -340,7 +198,7 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
  int coor[NDim];
  int c;
  int grid_size[NDim];
  DType max_value, min_value;
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
@@ -376,13 +234,16 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
    }
    num = num_points_per_voxel_rw(voxelidx);
    if (num < max_points) {
+      voxel_point_mask_rw(voxelidx, num) = DType(1);
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(voxelidx, num, k) = points_rw(i, k);
      }
      block_coor[0] = coor[1] / block_factor;
      block_coor[1] = coor[2] / block_factor;
-      mins_rw(block_coor[0], block_coor[1]) = std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
+      mins_rw(block_coor[0], block_coor[1]) =
-      maxs_rw(block_coor[0], block_coor[1]) = std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
+          std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
+      maxs_rw(block_coor[0], block_coor[1]) =
+          std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
      num_points_per_voxel_rw(voxelidx) += 1;
    }
  }
@@ -394,21 +255,23 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
    block_coor[1] = coor[2] / block_factor;
    min_value = mins_rw(block_coor[0], block_coor[1]);
    max_value = maxs_rw(block_coor[0], block_coor[1]);
-    startx = std::max(0, block_coor[0]-block_size/2);
+    startx = std::max(0, block_coor[0] - block_size / 2);
-    stopx = std::min(block_shape_H, block_coor[0]+block_size-block_size/2);
+    stopx =
-    starty = std::max(0, block_coor[1]-block_size/2);
+        std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
-    stopy = std::min(block_shape_W, block_coor[1]+block_size-block_size/2);
+    starty = std::max(0, block_coor[1] - block_size / 2);
+    stopy =
+        std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
-    for (int j = startx; j < stopx; ++j){
+    for (int j = startx; j < stopx; ++j) {
-      for (int k = starty; k < stopy; ++k){
+      for (int k = starty; k < stopy; ++k) {
        min_value = std::min(min_value, mins_rw(j, k));
        max_value = std::max(max_value, maxs_rw(j, k));
      }
    }
-    voxel_mask_rw(i) = (max_value - min_value) > height_threshold;
+    voxel_mask_rw(i) = ((max_value - min_value) > height_threshold) &&
+                       ((max_value - min_value) < height_high_threshold);
  }
  return voxel_num;
 }
 } // namespace spconv
\ No newline at end of file
--- a/include/spconv/spconv_ops.h
+++ b/include/spconv/spconv_ops.h
@@ -30,10 +30,12 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
        std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
        std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
        std::vector<int64_t> padding, std::vector<int64_t> dilation,
-        std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
+        std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose, int64_t _useHash) {
  // auto timer = spconv::CudaContextTimer<>();
  bool subM = _subM != 0;
  bool transpose = _transpose != 0;
+  bool useHash = _useHash != 0;
  auto numAct = indices.size(0);
  auto coorDim = indices.size(1) - 1; // batchIdx + xyz
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
@@ -52,13 +54,20 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];
  }
+  std::string msg = "due to limits of cuda hash, the volume of dense space include batch size ";
+  msg += "must less than std::numeric_limits<int>::max()";
+  TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(), msg);
  torch::Tensor indicePairs =
      torch::full({kernelVolume, 2, numAct}, -1,
                   torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
+  auto gridSize = batchSize * outputVolume;
+  if (useHash){
+    gridSize = 1;
+  }
  torch::Tensor gridOut =
-      torch::full({batchSize * outputVolume}, -1,
+      torch::full({gridSize}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  // std::cout << "full time " << timer.report() / 1000.0 << std::endl;
  int64_t numActOut = -1;
@@ -90,14 +99,14 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
-          stride32, padding32, dilation32, outSpatialShape32, transpose);
+          stride32, padding32, dilation32, outSpatialShape32, transpose, useHash);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum), kernelSize32,
-          stride32, padding32, dilation32, outSpatialShape32, transpose);
+          stride32, padding32, dilation32, outSpatialShape32, transpose, useHash);
    }
    return {indices, indicePairs, indiceNum};
  } else {
@@ -129,7 +138,7 @@ getIndicePair(torch::Tensor indices, int64_t batchSize,
            tv::TorchGPU(), tv::torch2tv<const int>(indices),
            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
-            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
+            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose, useHash);
      }
    }
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
@@ -142,10 +151,12 @@ getIndicePairPreGrid(torch::Tensor indices, torch::Tensor gridOut, int64_t batch
        std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
        std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
        std::vector<int64_t> padding, std::vector<int64_t> dilation,
-        std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
+        std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose, int64_t _useHash) {
  // auto timer = spconv::CudaContextTimer<>();
  bool subM = _subM != 0;
  bool transpose = _transpose != 0;
+  bool useHash = _useHash != 0;
+  TV_ASSERT_RT_ERR(!useHash, "error");
  auto numAct = indices.size(0);
  auto coorDim = indices.size(1) - 1; // batchIdx + xyz
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");

--- a/include/tensorview/tensorview.h
+++ b/include/tensorview/tensorview.h
@@ -91,15 +91,29 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
 #define TV_CHECK_CUDA_ERR()                                                    \
  {                                                                            \
-    auto err = cudaGetLastError();                                             \
+    auto __macro_err = cudaGetLastError(); /* reads and clears the last CUDA runtime error */ \
-    if (err != cudaSuccess) {                                                  \
+    if (__macro_err != cudaSuccess) { /* NOTE(review): local renamed from err, presumably to avoid clashing with names at the expansion site */ \
      std::stringstream __macro_s;                                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << "cuda execution failed with error " << err;                 \
+      __macro_s << "cuda execution failed with error " << __macro_err; /* message = file, line, then the error value */ \
      throw std::runtime_error(__macro_s.str());                               \
    }                                                                          \
  }
+#define TV_CHECK_CUDA_ERR_V2(...) /* Throws std::runtime_error when cudaGetLastError() reports a failure; extra arguments are appended to the message. */ \
+  {                                                                            \
+    auto __macro_err = cudaGetLastError(); /* reads and clears the last CUDA runtime error */ \
+    if (__macro_err != cudaSuccess) {                                                  \
+      std::stringstream __macro_s;                                             \
+      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
+      __macro_s << "cuda execution failed with error " << __macro_err;                 \
+      __macro_s << " " << cudaGetErrorString(__macro_err) << "\n"; /* add the runtime's text description of the error */ \
+      tv::sstream_print(__macro_s, __VA_ARGS__); /* caller-supplied context; at least one argument is required, an empty list leaves a dangling comma */ \
+      throw std::runtime_error(__macro_s.str());                               \
+    }                                                                          \
+  }
 struct GPU {
  GPU(cudaStream_t s = 0) : mStream(s) {}
  virtual cudaStream_t getStream() const { return mStream; }

--- a/include/torch_utils.h
+++ b/include/torch_utils.h
@@ -58,6 +58,31 @@ template <typename T> void check_torch_dtype(const torch::Tensor &tensor) {
  }
 }
+template <typename T> // type2torch: maps a C++ scalar type to a torch dtype constant; val only selects the type, its value is never read
+constexpr auto type2torch(T val=T()) -> decltype(torch::kInt32){
+  TV_ASSERT_RT_ERR(false, "unknown type"); // fallback for every type without a specialization below
+}
+template <> // int -> torch::kInt32
+constexpr auto type2torch(int val) -> decltype(torch::kInt32){
+  return torch::kInt32;
+}
+template <> // long is 8 bytes on LP64 targets but 4 bytes on LLP64 (e.g. 64-bit Windows), so the dtype follows its actual width
+constexpr auto type2torch(long val) -> decltype(torch::kInt32){
+  return sizeof(long) == 8 ? torch::kInt64 : torch::kInt32;
+}
+template <> // float -> torch::kFloat32
+constexpr auto type2torch(float val) -> decltype(torch::kInt32){
+  return torch::kFloat32;
+}
+template <> // double -> torch::kFloat64
+constexpr auto type2torch(double val) -> decltype(torch::kInt32){
+  return torch::kFloat64;
+}
 template <typename T>
 tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
  check_torch_dtype<T>(tensor);

--- a/setup.py
+++ b/setup.py
@@ -45,7 +45,8 @@ class CMakeBuild(build_ext):
                      '-DCMAKE_PREFIX_PATH={}'.format(LIBTORCH_ROOT),
                      '-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
                      '-DSPCONV_BuildTests=OFF',
-                      '-DCMAKE_CUDA_FLAGS="--expt-relaxed-constexpr"']
+                      '-DCMAKE_CUDA_FLAGS="--expt-relaxed-constexpr"'
+                      ] #  -arch=sm_61
        cfg = 'Debug' if self.debug else 'Release'
        assert cfg == "Release", "pytorch ops don't support debug build."
        build_args = ['--config', cfg]

--- a/spconv/conv.py
+++ b/spconv/conv.py
@@ -49,6 +49,11 @@ def _calculate_fan_in_and_fan_out_hwio(tensor):
 class SparseConvolution(SparseModule):
+    __constants__ = [
+        'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse',
+        'transposed', 'output_padding', 'fused_bn'
+    ]
    def __init__(self,
                 ndim,
                 in_channels,
@@ -64,7 +69,8 @@ class SparseConvolution(SparseModule):
                 transposed=False,
                 inverse=False,
                 indice_key=None,
-                 fused_bn=False):
+                 fused_bn=False,
+                 use_hash=True):
        super(SparseConvolution, self).__init__()
        assert groups == 1
        if not isinstance(kernel_size, (list, tuple)):
@@ -96,6 +102,7 @@ class SparseConvolution(SparseModule):
        self.subm = subm
        self.indice_key = indice_key
        self.fused_bn = fused_bn
+        self.use_hash = use_hash
        self.weight = Parameter(
            torch.Tensor(*kernel_size, in_channels, out_channels))
@@ -167,16 +174,17 @@ class SparseConvolution(SparseModule):
                    self.output_padding,
                    self.subm,
                    self.transposed,
-                    grid=input.grid)
+                    grid=input.grid,
+                    use_hash=self.use_hash)
                input.indice_dict[self.indice_key] = (outids, indices,
                                                      indice_pairs,
                                                      indice_pair_num,
                                                      spatial_shape)
        if self.fused_bn:
            assert self.bias is not None
-            out_features = ops.fused_indice_conv(features, self.weight, self.bias, indice_pairs.to(device),
+            out_features = ops.fused_indice_conv(
-                                                indice_pair_num,
+                features, self.weight, self.bias, indice_pairs.to(device),
-                                                outids.shape[0], self.inverse, self.subm)
+                indice_pair_num, outids.shape[0], self.inverse, self.subm)
        else:
            if self.subm:
                out_features = Fsp.indice_subm_conv(features, self.weight,
@@ -185,15 +193,14 @@ class SparseConvolution(SparseModule):
                                                    outids.shape[0])
            else:
                if self.inverse:
-                    out_features = Fsp.indice_inverse_conv(features, self.weight,
+                    out_features = Fsp.indice_inverse_conv(
-                                                        indice_pairs.to(device),
+                        features, self.weight, indice_pairs.to(device),
-                                                        indice_pair_num,
+                        indice_pair_num, outids.shape[0])
-                                                        outids.shape[0])
                else:
                    out_features = Fsp.indice_conv(features, self.weight,
-                                                indice_pairs.to(device),
+                                                   indice_pairs.to(device),
-                                                indice_pair_num,
+                                                   indice_pair_num,
-                                                outids.shape[0])
+                                                   outids.shape[0])
            if self.bias is not None:
                out_features += self.bias
@@ -214,7 +221,8 @@ class SparseConv2d(SparseConvolution):
                 dilation=1,
                 groups=1,
                 bias=True,
-                 indice_key=None):
+                 indice_key=None,
+                 use_hash=True):
        super(SparseConv2d, self).__init__(
            2,
            in_channels,
@@ -225,7 +233,8 @@ class SparseConv2d(SparseConvolution):
            dilation,
            groups,
            bias,
-            indice_key=indice_key)
+            indice_key=indice_key,
+            use_hash=use_hash)
 class SparseConv3d(SparseConvolution):
@@ -238,7 +247,8 @@ class SparseConv3d(SparseConvolution):
                 dilation=1,
                 groups=1,
                 bias=True,
-                 indice_key=None):
+                 indice_key=None,
+                 use_hash=True):
        super(SparseConv3d, self).__init__(
            3,
            in_channels,
@@ -249,7 +259,9 @@ class SparseConv3d(SparseConvolution):
            dilation,
            groups,
            bias,
-            indice_key=indice_key)
+            indice_key=indice_key,
+            use_hash=use_hash)
 class SparseConv4d(SparseConvolution):
    def __init__(self,
@@ -261,7 +273,8 @@ class SparseConv4d(SparseConvolution):
                 dilation=1,
                 groups=1,
                 bias=True,
-                 indice_key=None):
+                 indice_key=None,
+                 use_hash=True):
        super(SparseConv4d, self).__init__(
            4,
            in_channels,
@@ -272,7 +285,8 @@ class SparseConv4d(SparseConvolution):
            dilation,
            groups,
            bias,
-            indice_key=indice_key)
+            indice_key=indice_key,
+            use_hash=use_hash)
 class SparseConvTranspose2d(SparseConvolution):
@@ -285,7 +299,8 @@ class SparseConvTranspose2d(SparseConvolution):
                 dilation=1,
                 groups=1,
                 bias=True,
-                 indice_key=None):
+                 indice_key=None,
+                 use_hash=True):
        super(SparseConvTranspose2d, self).__init__(
            2,
            in_channels,
@@ -297,7 +312,8 @@ class SparseConvTranspose2d(SparseConvolution):
            groups,
            bias,
            transposed=True,
-            indice_key=indice_key)
+            indice_key=indice_key,
+            use_hash=use_hash)
 class SparseConvTranspose3d(SparseConvolution):
@@ -310,7 +326,8 @@ class SparseConvTranspose3d(SparseConvolution):
                 dilation=1,
                 groups=1,
                 bias=True,
-                 indice_key=None):
+                 indice_key=None,
+                 use_hash=True):
        super(SparseConvTranspose3d, self).__init__(
            3,
            in_channels,
@@ -322,7 +339,8 @@ class SparseConvTranspose3d(SparseConvolution):
            groups,
            bias,
            transposed=True,
-            indice_key=indice_key)
+            indice_key=indice_key,
+            use_hash=use_hash)
 class SparseInverseConv2d(SparseConvolution):
@@ -369,7 +387,8 @@ class SubMConv2d(SparseConvolution):
                 dilation=1,
                 groups=1,
                 bias=True,
-                 indice_key=None):
+                 indice_key=None,
+                 use_hash=True):
        super(SubMConv2d, self).__init__(
            2,
            in_channels,
@@ -381,7 +400,8 @@ class SubMConv2d(SparseConvolution):
            groups,
            bias,
            True,
-            indice_key=indice_key)
+            indice_key=indice_key,
+            use_hash=use_hash)
 class SubMConv3d(SparseConvolution):
@@ -394,7 +414,8 @@ class SubMConv3d(SparseConvolution):
                 dilation=1,
                 groups=1,
                 bias=True,
-                 indice_key=None):
+                 indice_key=None,
+                 use_hash=True):
        super(SubMConv3d, self).__init__(
            3,
            in_channels,
@@ -406,7 +427,9 @@ class SubMConv3d(SparseConvolution):
            groups,
            bias,
            True,
-            indice_key=indice_key)
+            indice_key=indice_key,
+            use_hash=use_hash)
 class SubMConv4d(SparseConvolution):
    def __init__(self,
@@ -418,7 +441,8 @@ class SubMConv4d(SparseConvolution):
                 dilation=1,
                 groups=1,
                 bias=True,
-                 indice_key=None):
+                 indice_key=None,
+                 use_hash=True):
        super(SubMConv4d, self).__init__(
            4,
            in_channels,
@@ -430,4 +454,5 @@ class SubMConv4d(SparseConvolution):
            groups,
            bias,
            True,
-            indice_key=indice_key)
+            indice_key=indice_key,
+            use_hash=use_hash)