spconv v1.1 release:

1. add cuda hash support for cuda indice generation. 2. use hash table instead of dense table in CPU code. 3. add CPU-only build support.

spconv v1.1 release:
1. add cuda hash support for cuda indice generation. 2. use hash table instead of dense table in CPU code. 3. add CPU-only build support.
a6ae8967 · traveller59 · 0757c45b · a6ae8967 · a6ae8967 · a6ae8967
Commit a6ae8967 authored May 24, 2019 by traveller59
20 changed files
--- a/include/tsl/robin_growth_policy.h
+++ b/include/tsl/robin_growth_policy.h
+/**
+ * MIT License
+ * 
+ * Copyright (c) 2017 Tessil
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef TSL_ROBIN_GROWTH_POLICY_H
+#define TSL_ROBIN_GROWTH_POLICY_H 
+
+
+#include <algorithm>
+#include <array>
+#include <climits>
+#include <cmath>
+#include <cstddef>
+#include <iterator>
+#include <limits>
+#include <ratio>
+#include <stdexcept>
+
+
+#ifdef TSL_DEBUG
+#    define tsl_rh_assert(expr) assert(expr)
+#else
+#    define tsl_rh_assert(expr) (static_cast<void>(0))
+#endif
+
+
+/**
+ * If exceptions are enabled, throw the exception passed as a parameter; otherwise call std::terminate
+ * (after printing the message to stderr when NDEBUG is not defined).
+ */
+#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || (defined (_MSC_VER) && defined (_CPPUNWIND))) && !defined(TSL_NO_EXCEPTIONS)
+#    define TSL_RH_THROW_OR_TERMINATE(ex, msg) throw ex(msg)
+#else
+#    ifdef NDEBUG
+#        define TSL_RH_THROW_OR_TERMINATE(ex, msg) std::terminate()
+#    else
+#        include <cstdio>
+#        define TSL_RH_THROW_OR_TERMINATE(ex, msg) do { std::fprintf(stderr, msg); std::terminate(); } while(0)
+#    endif
+#endif
+
+
+#if defined(__GNUC__) || defined(__clang__)
+#    define TSL_RH_LIKELY(exp) (__builtin_expect(!!(exp), true))
+#else
+#    define TSL_RH_LIKELY(exp) (exp)
+#endif
+
+
+namespace tsl {
+namespace rh {
+    
+/**
+ * Grow the hash table by a factor of GrowthFactor keeping the bucket count to a power of two. It allows
+ * the table to use a mask operation instead of a modulo operation to map a hash to a bucket.
+ * 
+ * GrowthFactor must be a power of two >= 2.
+ */
+template<std::size_t GrowthFactor>
+class power_of_two_growth_policy {
+public:
+    /**
+     * Called on the hash table creation and on rehash. The number of buckets for the table is passed in parameter.
+     * This number is a minimum, the policy may update this value with a higher value if needed (but not lower).
+     *
+     * If 0 is given, min_bucket_count_in_out must still be 0 after the policy creation and
+     * bucket_for_hash must always return 0 in this case.
+     *
+     * A non-zero count is rounded up to the next power of two and written back through the reference;
+     * the mask used by bucket_for_hash is then that count minus one. For a count of 0 the mask is 0.
+     *
+     * Reports std::length_error through TSL_RH_THROW_OR_TERMINATE (which throws or terminates depending
+     * on the build configuration) if the requested count is greater than max_bucket_count().
+     */
+    explicit power_of_two_growth_policy(std::size_t& min_bucket_count_in_out) {
+        if(min_bucket_count_in_out > max_bucket_count()) {
+            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size.");
+        }
+        
+        if(min_bucket_count_in_out > 0) {
+            min_bucket_count_in_out = round_up_to_power_of_two(min_bucket_count_in_out);
+            m_mask = min_bucket_count_in_out - 1;
+        }
+        else {
+            m_mask = 0;
+        }
+    }
+    
+    /**
+     * Return the bucket [0, bucket_count()) to which the hash belongs. 
+     * If bucket_count() is 0, it must always return 0.
+     *
+     * Computed as hash & m_mask, so a mask of 0 maps every hash to bucket 0.
+     */
+    std::size_t bucket_for_hash(std::size_t hash) const noexcept {
+        return hash & m_mask;
+    }
+    
+    /**
+     * Return the number of buckets that should be used on next growth.
+     *
+     * The result is (m_mask + 1) * GrowthFactor. Reports std::length_error through
+     * TSL_RH_THROW_OR_TERMINATE if (m_mask + 1) is greater than max_bucket_count() / GrowthFactor,
+     * i.e. before the multiplication could exceed the maximum.
+     */
+    std::size_t next_bucket_count() const {
+        if((m_mask + 1) > max_bucket_count() / GrowthFactor) {
+            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size.");
+        }
+        
+        return (m_mask + 1) * GrowthFactor;
+    }
+    
+    /**
+     * Return the maximum number of buckets supported by the policy.
+     */
+    std::size_t max_bucket_count() const {
+        // Largest power of two representable in std::size_t.
+        return (std::numeric_limits<std::size_t>::max() / 2) + 1;
+    }
+    
+    /**
+     * Reset the growth policy as if it was created with a bucket count of 0.
+     * After a clear, the policy must always return 0 when bucket_for_hash is called.
+     */
+    void clear() noexcept {
+        m_mask = 0;
+    }
+    
+private:
+    static std::size_t round_up_to_power_of_two(std::size_t value) {
+        if(is_power_of_two(value)) {
+            return value;
+        }
+        
+        if(value == 0) {
+            return 1;
+        }
+            
+        --value;
+        for(std::size_t i = 1; i < sizeof(std::size_t) * CHAR_BIT; i *= 2) {
+            value |= value >> i;
+        }
+        
+        return value + 1;
+    }
+    
+    static constexpr bool is_power_of_two(std::size_t value) {
+        return value != 0 && (value & (value - 1)) == 0;
+    }
+    
+protected:
+    static_assert(is_power_of_two(GrowthFactor) && GrowthFactor >= 2, "GrowthFactor must be a power of two >= 2.");
+    
+    std::size_t m_mask;
+};
+
+
+/**
+ * Grow the hash table by GrowthFactor::num / GrowthFactor::den and use a modulo to map a hash
+ * to a bucket. Slower but it can be useful if you want a slower growth.
+ *
+ * Constructor: keeps the requested bucket count unchanged and uses it as the modulus. A count of 0
+ * sets the modulus to 1, so bucket_for_hash always returns 0. Reports std::length_error through
+ * TSL_RH_THROW_OR_TERMINATE if the requested count is greater than max_bucket_count().
+ *
+ * bucket_for_hash: returns hash % modulus.
+ *
+ * next_bucket_count: returns ceil(modulus * growth factor), capped at max_bucket_count(). Reports
+ * std::length_error through TSL_RH_THROW_OR_TERMINATE if the modulus already equals max_bucket_count()
+ * or if the computed value is not a normal floating-point number.
+ *
+ * max_bucket_count: std::numeric_limits<std::size_t>::max() divided by the growth factor.
+ *
+ * clear: resets the modulus to 1, as if the policy had been created with a bucket count of 0.
+ *
+ * The growth factor must be >= 1.1; this is enforced by a static_assert.
+ */
+template<class GrowthFactor = std::ratio<3, 2>>
+class mod_growth_policy {
+public:
+    explicit mod_growth_policy(std::size_t& min_bucket_count_in_out) {
+        if(min_bucket_count_in_out > max_bucket_count()) {
+            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size.");
+        }
+        
+        if(min_bucket_count_in_out > 0) {
+            m_mod = min_bucket_count_in_out;
+        }
+        else {
+            m_mod = 1;
+        }
+    }
+    
+    std::size_t bucket_for_hash(std::size_t hash) const noexcept {
+        return hash % m_mod;
+    }
+    
+    std::size_t next_bucket_count() const {
+        if(m_mod == max_bucket_count()) {
+            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size.");
+        }
+        
+        const double next_bucket_count = std::ceil(double(m_mod) * REHASH_SIZE_MULTIPLICATION_FACTOR);
+        if(!std::isnormal(next_bucket_count)) {
+            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size.");
+        }
+        
+        if(next_bucket_count > double(max_bucket_count())) {
+            return max_bucket_count();
+        }
+        else {
+            return std::size_t(next_bucket_count);
+        }
+    }
+    
+    std::size_t max_bucket_count() const {
+        return MAX_BUCKET_COUNT;
+    }
+    
+    void clear() noexcept {
+        m_mod = 1;
+    }
+    
+private:
+    static constexpr double REHASH_SIZE_MULTIPLICATION_FACTOR = 1.0 * GrowthFactor::num / GrowthFactor::den;
+    static const std::size_t MAX_BUCKET_COUNT = 
+            std::size_t(double(
+                    std::numeric_limits<std::size_t>::max() / REHASH_SIZE_MULTIPLICATION_FACTOR
+            ));
+            
+    static_assert(REHASH_SIZE_MULTIPLICATION_FACTOR >= 1.1, "Growth factor should be >= 1.1.");
+    
+    std::size_t m_mod;
+};
+
+
+
+namespace detail {
+
+static constexpr const std::array<std::size_t, 40> PRIMES = {{
+    1ul, 5ul, 17ul, 29ul, 37ul, 53ul, 67ul, 79ul, 97ul, 131ul, 193ul, 257ul, 389ul, 521ul, 769ul, 1031ul, 
+    1543ul, 2053ul, 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, 98317ul, 196613ul, 393241ul, 786433ul, 
+    1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, 100663319ul, 201326611ul, 
+    402653189ul, 805306457ul, 1610612741ul, 3221225473ul, 4294967291ul
+}};
+
+template<unsigned int IPrime>
+static constexpr std::size_t mod(std::size_t hash) { return hash % PRIMES[IPrime]; }
+
+// MOD_PRIME[iprime](hash) returns hash % PRIMES[iprime]. This table allows for a faster modulo, as the
+// compiler can optimize the modulo better when the divisor is a constant known at compile time.
+static constexpr const std::array<std::size_t(*)(std::size_t), 40> MOD_PRIME = {{ 
+    &mod<0>, &mod<1>, &mod<2>, &mod<3>, &mod<4>, &mod<5>, &mod<6>, &mod<7>, &mod<8>, &mod<9>, &mod<10>, 
+    &mod<11>, &mod<12>, &mod<13>, &mod<14>, &mod<15>, &mod<16>, &mod<17>, &mod<18>, &mod<19>, &mod<20>, 
+    &mod<21>, &mod<22>, &mod<23>, &mod<24>, &mod<25>, &mod<26>, &mod<27>, &mod<28>, &mod<29>, &mod<30>, 
+    &mod<31>, &mod<32>, &mod<33>, &mod<34>, &mod<35>, &mod<36>, &mod<37> , &mod<38>, &mod<39>
+}};
+
+}
+
+/**
+ * Grow the hash table by using prime numbers as bucket count. Slower than tsl::rh::power_of_two_growth_policy in  
+ * general but will probably distribute the values around better in the buckets with a poor hash function.
+ * 
+ * To allow the compiler to optimize the modulo operation, a lookup table is used with constant prime numbers.
+ * 
+ * With a switch the code would look like:
+ * \code
+ * switch(iprime) { // iprime is the index of the current prime in detail::PRIMES
+ *     case 1: hash % 5ul;
+ *             break;
+ *     case 2: hash % 17ul;
+ *             break;
+ *     case 3: hash % 29ul;
+ *             break;
+ *     ...
+ * }    
+ * \endcode
+ * 
+ * Because the divisor is a compile-time constant, the compiler is able to replace the modulo
+ * with a series of multiplications, subtractions and shifts.
+ * 
+ * The 'hash % 5' could become something like 'hash - ((hash * 0xCCCCCCCD) >> 34) * 5' in a 64-bit environment.
+ */
+class prime_growth_policy {
+public:
+    explicit prime_growth_policy(std::size_t& min_bucket_count_in_out) {
+        auto it_prime = std::lower_bound(detail::PRIMES.begin(), 
+                                         detail::PRIMES.end(), min_bucket_count_in_out);
+        if(it_prime == detail::PRIMES.end()) {
+            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size.");
+        }
+        
+        m_iprime = static_cast<unsigned int>(std::distance(detail::PRIMES.begin(), it_prime));
+        if(min_bucket_count_in_out > 0) {
+            min_bucket_count_in_out = *it_prime;
+        }
+        else {
+            min_bucket_count_in_out = 0;
+        }
+    }
+    
+    std::size_t bucket_for_hash(std::size_t hash) const noexcept {
+        return detail::MOD_PRIME[m_iprime](hash);
+    }
+    
+    std::size_t next_bucket_count() const {
+        if(m_iprime + 1 >= detail::PRIMES.size()) {
+            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maxmimum size.");
+        }
+        
+        return detail::PRIMES[m_iprime + 1];
+    }   
+    
+    std::size_t max_bucket_count() const {
+        return detail::PRIMES.back();
+    }
+    
+    void clear() noexcept {
+        m_iprime = 0;
+    }
+    
+private:
+    unsigned int m_iprime;
+    
+    static_assert(std::numeric_limits<decltype(m_iprime)>::max() >= detail::PRIMES.size(), 
+                  "The type of m_iprime is not big enough.");
+}; 
+
+}
+}
+
+#endif
--- a/include/tsl/robin_hash.h
+++ b/include/tsl/robin_hash.h
--- a/include/tsl/robin_map.h
+++ b/include/tsl/robin_map.h
--- a/include/utility/timer.h
+++ b/include/utility/timer.h
@@ -14,11 +14,14 @@

 #pragma once
 #include <chrono>
+#ifdef SPCONV_CUDA
 #include <cuda_runtime_api.h>
+#endif
 #include <iostream>

 namespace spconv {

+#ifdef SPCONV_CUDA
 template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
  CudaContextTimer() {
    cudaDeviceSynchronize();
@@ -36,6 +39,7 @@ template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
 private:
  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
 };
+#endif

 template <typename TimeT = std::chrono::microseconds> struct CPUTimer {
  CPUTimer() { mCurTime = std::chrono::steady_clock::now(); }

--- a/setup.py
+++ b/setup.py
@@ -45,8 +45,16 @@ class CMakeBuild(build_ext):
                      '-DCMAKE_PREFIX_PATH={}'.format(LIBTORCH_ROOT),
                      '-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
                      '-DSPCONV_BuildTests=OFF',
-                      '-DCMAKE_CUDA_FLAGS="--expt-relaxed-constexpr"'
                      ] #  -arch=sm_61
+        if not torch.cuda.is_available():
+            cmake_args += ['-DSPCONV_BuildCUDA=OFF']
+        else:
+            cuda_flags = ["\"--expt-relaxed-constexpr\""]
+            # The following flags must be added to use at::Half,
+            # but they disable CUDA's raw half operators and conversions.
+            cuda_flags += ["-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__"]
+            cuda_flags += ["-D__CUDA_NO_HALF2_OPERATORS__"] 
+            cmake_args += ['-DCMAKE_CUDA_FLAGS=' + " ".join(cuda_flags)]
        cfg = 'Debug' if self.debug else 'Release'
        assert cfg == "Release", "pytorch ops don't support debug build."
        build_args = ['--config', cfg]

--- a/spconv/conv.py
+++ b/spconv/conv.py
@@ -70,7 +70,7 @@ class SparseConvolution(SparseModule):
                 inverse=False,
                 indice_key=None,
                 fused_bn=False,
-                 use_hash=False):
+                 use_hash=True):
        super(SparseConvolution, self).__init__()
        assert groups == 1
        if not isinstance(kernel_size, (list, tuple)):
@@ -136,7 +136,6 @@ class SparseConvolution(SparseModule):
                out_spatial_shape = ops.get_conv_output_size(
                    spatial_shape, self.kernel_size, self.stride, self.padding,
                    self.dilation)
-
        else:
            out_spatial_shape = spatial_shape
        # input.update_grid(out_spatial_shape)
@@ -222,7 +221,7 @@ class SparseConv2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False):
+                 use_hash=True):
        super(SparseConv2d, self).__init__(
            2,
            in_channels,
@@ -248,7 +247,7 @@ class SparseConv3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False):
+                 use_hash=True):
        super(SparseConv3d, self).__init__(
            3,
            in_channels,
@@ -274,7 +273,7 @@ class SparseConv4d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False):
+                 use_hash=True):
        super(SparseConv4d, self).__init__(
            4,
            in_channels,
@@ -300,7 +299,7 @@ class SparseConvTranspose2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False):
+                 use_hash=True):
        super(SparseConvTranspose2d, self).__init__(
            2,
            in_channels,
@@ -327,7 +326,7 @@ class SparseConvTranspose3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False):
+                 use_hash=True):
        super(SparseConvTranspose3d, self).__init__(
            3,
            in_channels,
@@ -388,7 +387,7 @@ class SubMConv2d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False):
+                 use_hash=True):
        super(SubMConv2d, self).__init__(
            2,
            in_channels,
@@ -415,7 +414,7 @@ class SubMConv3d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False):
+                 use_hash=True):
        super(SubMConv3d, self).__init__(
            3,
            in_channels,
@@ -442,7 +441,7 @@ class SubMConv4d(SparseConvolution):
                 groups=1,
                 bias=True,
                 indice_key=None,
-                 use_hash=False):
+                 use_hash=True):
        super(SubMConv4d, self).__init__(
            4,
            in_channels,

--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -88,8 +88,10 @@ def get_indice_pairs(indices,
            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_4d
        else:
            raise NotImplementedError
-        return get_indice_pairs_func(indices, batch_size, out_shape, spatial_shape, ksize,
+
+        res = get_indice_pairs_func(indices, batch_size, out_shape, spatial_shape, ksize,
                            stride, padding, dilation, out_padding, int(subm), int(transpose), int(use_hash))
+        return res
    else:
        if ndim == 2:
            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_2d

--- a/spconv/utils/__init__.py
+++ b/spconv/utils/__init__.py
@@ -15,10 +15,13 @@
 import numpy as np

 from spconv import spconv_utils
-from spconv.spconv_utils import (
-    non_max_suppression, non_max_suppression_cpu, points_to_voxel_3d_np,
+from spconv.spconv_utils import (non_max_suppression_cpu, points_to_voxel_3d_np,
    points_to_voxel_3d_np_mean, points_to_voxel_3d_with_filtering,
    rbbox_intersection, rbbox_iou, rotate_non_max_suppression_cpu)
+try:
+    from spconv.spconv_utils import non_max_suppression
+except ImportError:
+    pass


 def points_to_voxel(points,

--- a/src/cuhash/CMakeLists.txt
+++ b/src/cuhash/CMakeLists.txt
+add_library(cuhash SHARED hash_functions.cu hash_table.cpp hash_table.cu)
+
+target_include_directories(cuhash PRIVATE ${ALL_INCLUDE} )
+set_property(TARGET cuhash PROPERTY CUDA_STANDARD 14)
+set_property(TARGET cuhash PROPERTY CXX_STANDARD 14)
+set_target_properties(cuhash PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_link_libraries(cuhash PRIVATE ${ALL_LIBS})
+install (TARGETS cuhash DESTINATION lib)
+
+if (SPCONV_BuildTests)
+    add_executable(cuhash_test main.cc)
+
+    target_include_directories(cuhash_test PRIVATE ${ALL_INCLUDE} )
+    set_property(TARGET cuhash_test PROPERTY CUDA_STANDARD 14)
+    set_property(TARGET cuhash_test PROPERTY CXX_STANDARD 14)
+    set_target_properties(cuhash_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    target_link_libraries(cuhash_test PRIVATE ${ALL_LIBS} cuhash)
+    install (TARGETS cuhash_test DESTINATION bin)
+endif()
\ No newline at end of file
--- a/src/hash/debugging.cpp
+++ b/src/hash/debugging.cpp
@@ -15,14 +15,14 @@
 * @brief Debugging/statistics/performance utilities for hash tables.
 */

-#include <hash/debugging.h>
-#include <hash/definitions.h>
+#include <cuhash/debugging.h>
+#include <cuhash/definitions.h>

 #include <algorithm>
 #include <cstring>
-#include <hash/cuda_util.h>
+#include <cuhash/cuda_util.h>

-namespace cudahash {
+namespace cuhash {


 void OutputRetrievalStatistics(const unsigned  n_queries,

--- a/src/hash/debugging.cu
+++ b/src/hash/debugging.cu
@@ -15,14 +15,14 @@
 * @brief Debugging/statistics/performance utilities for hash tables.
 */

-#include <hash/debugging.h>
-#include <hash/definitions.h>
-#include <hash/hash_table.cuh>
+#include <cuhash/debugging.h>
+#include <cuhash/definitions.h>
+#include <cuhash/hash_table.cuh>

 #include <algorithm>
-#include <hash/cuda_util.h>
+#include <cuhash/cuda_util.h>

-namespace cudahash {
+namespace cuhash {


 //! Debugging function: Takes statistics on the hash functions' distribution.
@@ -231,9 +231,9 @@ bool CheckAssignedSameSlot(const unsigned  N,


 void PrintStashContents(const Entry *d_stash) {
-  Entry *stash = new Entry[cudahash::kStashSize];
-  CUDA_SAFE_CALL(cudaMemcpy(stash, d_stash, sizeof(Entry) * cudahash::kStashSize, cudaMemcpyDeviceToHost));
-  for (unsigned i = 0; i < cudahash::kStashSize; ++i) {
+  Entry *stash = new Entry[cuhash::kStashSize];
+  CUDA_SAFE_CALL(cudaMemcpy(stash, d_stash, sizeof(Entry) * cuhash::kStashSize, cudaMemcpyDeviceToHost));
+  for (unsigned i = 0; i < cuhash::kStashSize; ++i) {
    if (get_key(stash[i]) != kKeyEmpty) {
      char buffer[256];
      sprintf(buffer, "Stash[%u]: %u = %u", i, get_key(stash[i]), get_value(stash[i]));

--- a/src/hash/hash_functions.cu
+++ b/src/hash/hash_functions.cu
-#include <hash/hash_table.h>
-#include <hash/debugging.h>
+#include <cuhash/hash_table.h>
+#include <cuhash/debugging.h>
+#include <cassert>
+#include <random>

-#include <hash/mt19937ar.h>
+namespace cuhash {
+  
+std::random_device random_dev;

-#include <cassert>
+std::mt19937 random_engine(random_dev());
+std::uniform_int_distribution<unsigned> uint_distribution;

-namespace cudahash {
+unsigned generate_random_uint32(){
+  return uint_distribution(random_engine);
+}

 void GenerateFunctions(const unsigned  N,
                       const unsigned  num_keys,
@@ -19,9 +26,11 @@ void GenerateFunctions(const unsigned  N,

    // Generate a set of hash function constants for this build attempt.
    for (unsigned i = 0 ; i < N; ++i) {
-      unsigned new_a = genrand_int32() % kPrimeDivisor;
+      // uint_distribution(random_engine) % kPrimeDivisor;
+      // genrand_int32() % kPrimeDivisor;
+      unsigned new_a = generate_random_uint32() % kPrimeDivisor;
      constants[i].x = (1 > new_a ? 1 : new_a);
-      constants[i].y = genrand_int32() % kPrimeDivisor;
+      constants[i].y = generate_random_uint32() % kPrimeDivisor;
    }

 #ifdef FORCEFULLY_GENERATE_NO_CYCLES

--- a/src/hash/hash_table.cpp
+++ b/src/hash/hash_table.cpp
@@ -14,20 +14,18 @@
 * @brief Implements a basic hash table that stores one value per key.
 */

-#include <hash/hash_table.h>
-#include <hash/debugging.h>
+#include <cuhash/hash_table.h>
+#include <cuhash/debugging.h>

 #include <algorithm>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <limits>
-#include <hash/mt19937ar.h>
-
 #include <cuda_runtime_api.h>
-#include <hash/cuda_util.h>
+#include <cuhash/cuda_util.h>

-namespace cudahash {
+namespace cuhash {

 char buffer[256];

@@ -164,8 +162,8 @@ bool HashTable::Build(const unsigned  n,
        else
            constants_5_.Generate(n, d_keys,table_size_);

-        stash_constants_.x = std::max(1lu, genrand_int32()) % kPrimeDivisor;
-        stash_constants_.y = genrand_int32() % kPrimeDivisor;
+        stash_constants_.x = std::max(1u, generate_random_uint32()) % kPrimeDivisor;
+        stash_constants_.y = generate_random_uint32() % kPrimeDivisor;
        stash_count_ = 0;

        // Initialize memory.
@@ -205,8 +203,8 @@ bool HashTable::Build(const unsigned  n,
    // Copy out the stash size.
    CUDA_SAFE_CALL(cudaMemcpy( &stash_count_, d_stash_count, sizeof(unsigned), cudaMemcpyDeviceToHost ));
    if (stash_count_ && num_failures == 0) {
-        sprintf(buffer, "Stash size: %u", stash_count_);
-        PrintMessage(buffer, true);
+        // sprintf(buffer, "Stash size: %u", stash_count_);
+        // PrintMessage(buffer, true);

 #ifdef _DEBUG
        PrintStashContents(d_contents_ + table_size_);
@@ -226,7 +224,7 @@ bool HashTable::Build(const unsigned  n,
        sprintf(buffer, "Completely failed to build");
        PrintMessage(buffer, true);
    } else if (num_attempts > 1) {
-        sprintf(buffer, "Needed %u attempts to build", num_attempts);
+        sprintf(buffer, "Needed %u attempts to build, you can ignore this message.", num_attempts);
        PrintMessage(buffer, true);
    }


--- a/src/hash/hash_table.cu
+++ b/src/hash/hash_table.cu
@@ -14,14 +14,14 @@
 * @brief Hides all of the CUDA calls from the actual CPP file.
 */

-#include <hash/cuda_util.h>
-#include <hash/debugging.h>
-#include <hash/definitions.h>
-#include <hash/hash_table.cuh>
+#include <cuhash/cuda_util.h>
+#include <cuhash/debugging.h>
+#include <cuhash/definitions.h>
+#include <cuhash/hash_table.cuh>

 #include <cuda.h>

-namespace cudahash {
+namespace cuhash {

 namespace CUDAWrapper {
    void ClearTable(const unsigned  slots_in_table,

--- a/src/hash/main.cc
+++ b/src/hash/main.cc
-#include <hash/hash_table.h>
+#include <cuhash/hash_table.h>
 #include <cuda.h>

 int main(){
-    auto table = cudahash::HashTable();
+    auto table = cuhash::HashTable();
    table.Initialize(10, 2.0);
    const int N = 10;


--- a/src/hash/CMakeLists.txt
+++ b/src/hash/CMakeLists.txt
-add_library(cudahash SHARED hash_functions.cu hash_table.cpp hash_table.cu 
-                            mt19937ar.cpp)
-
-target_include_directories(cudahash PRIVATE ${ALL_INCLUDE} )
-set_property(TARGET cudahash PROPERTY CUDA_STANDARD 14)
-set_property(TARGET cudahash PROPERTY CXX_STANDARD 14)
-set_target_properties(cudahash PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-target_link_libraries(cudahash PRIVATE ${ALL_LIBS})
-install (TARGETS cudahash DESTINATION lib)
-
-add_executable(cudahash_test main.cc)
-
-target_include_directories(cudahash_test PRIVATE ${ALL_INCLUDE} )
-set_property(TARGET cudahash_test PROPERTY CUDA_STANDARD 14)
-set_property(TARGET cudahash_test PROPERTY CXX_STANDARD 14)
-set_target_properties(cudahash_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-target_link_libraries(cudahash_test PRIVATE ${ALL_LIBS} cudahash)
-install (TARGETS cudahash_test DESTINATION bin)
--- a/src/hash/mt19937ar.cpp
+++ b/src/hash/mt19937ar.cpp
-/*
-   A C-program for MT19937, with initialization improved 2002/1/26.
-   Coded by Takuji Nishimura and Makoto Matsumoto.
-
-   Before using, initialize the state by using init_genrand(seed)
-   or init_by_array(init_key, key_length).
-
-   Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions
-   are met:
-
-     1. Redistributions of source code must retain the above copyright
-        notice, this list of conditions and the following disclaimer.
-
-     2. Redistributions in binary form must reproduce the above copyright
-        notice, this list of conditions and the following disclaimer in the
-        documentation and/or other materials provided with the distribution.
-
-     3. The names of its contributors may not be used to endorse or promote
-        products derived from this software without specific prior written
-        permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-   Any feedback is very welcome.
-   http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
-   email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
-*/
-
-#include <stdio.h>
-
-/* Period parameters */
-#define N 624
-#define M 397
-#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
-#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
-#define LOWER_MASK 0x7fffffffUL /* least significant r bits */
-
-static unsigned long mt[N]; /* the array for the state vector  */
-static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */
-
-/* initializes mt[N] with a seed */
-void init_genrand(unsigned long s)
-{
-    mt[0]= s & 0xffffffffUL;
-    for (mti=1; mti<N; mti++) {
-        mt[mti] =
-            (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);
-        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
-        /* In the previous versions, MSBs of the seed affect   */
-        /* only MSBs of the array mt[].                        */
-        /* 2002/01/09 modified by Makoto Matsumoto             */
-        mt[mti] &= 0xffffffffUL;
-        /* for >32 bit machines */
-    }
-}
-
-/* initialize by an array with array-length */
-/* init_key is the array for initializing keys */
-/* key_length is its length */
-/* slight change for C++, 2004/2/26 */
-void init_by_array(unsigned long init_key[], int key_length)
-{
-    int i, j, k;
-    init_genrand(19650218UL);
-    i=1; j=0;
-    k = (N>key_length ? N : key_length);
-    for (; k; k--) {
-        mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
-          + init_key[j] + j; /* non linear */
-        mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
-        i++; j++;
-        if (i>=N) { mt[0] = mt[N-1]; i=1; }
-        if (j>=key_length) j=0;
-    }
-    for (k=N-1; k; k--) {
-        mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
-          - i; /* non linear */
-        mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
-        i++;
-        if (i>=N) { mt[0] = mt[N-1]; i=1; }
-    }
-
-    mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
-}
-
-/* generates a random number on [0,0xffffffff]-interval */
-unsigned long genrand_int32(void)
-{
-    unsigned long y;
-    static unsigned long mag01[2]={0x0UL, MATRIX_A};
-    /* mag01[x] = x * MATRIX_A  for x=0,1 */
-
-    if (mti >= N) { /* generate N words at one time */
-        int kk;
-
-        if (mti == N+1)   /* if init_genrand() has not been called, */
-            init_genrand(5489UL); /* a default initial seed is used */
-
-        for (kk=0;kk<N-M;kk++) {
-            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
-            mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
-        }
-        for (;kk<N-1;kk++) {
-            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
-            mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
-        }
-        y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
-        mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];
-
-        mti = 0;
-    }
-
-    y = mt[mti++];
-
-    /* Tempering */
-    y ^= (y >> 11);
-    y ^= (y << 7) & 0x9d2c5680UL;
-    y ^= (y << 15) & 0xefc60000UL;
-    y ^= (y >> 18);
-
-    return y;
-}
-
-/* generates a random number on [0,0x7fffffff]-interval */
-long genrand_int31(void)
-{
-    return (long)(genrand_int32()>>1);
-}
-
-/* generates a random number on [0,1]-real-interval */
-double genrand_real1(void)
-{
-    return genrand_int32()*(1.0/4294967295.0);
-    /* divided by 2^32-1 */
-}
-
-/* generates a random number on [0,1)-real-interval */
-double genrand_real2(void)
-{
-    return genrand_int32()*(1.0/4294967296.0);
-    /* divided by 2^32 */
-}
-
-/* generates a random number on (0,1)-real-interval */
-double genrand_real3(void)
-{
-    return (((double)genrand_int32()) + 0.5)*(1.0/4294967296.0);
-    /* divided by 2^32 */
-}
-
-/* generates a random number on [0,1) with 53-bit resolution*/
-double genrand_res53(void)
-{
-    unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6;
-    return(a*67108864.0+b)*(1.0/9007199254740992.0);
-}
-/* These real versions are due to Isaku Wada, 2002/01/09 added */
--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
-add_library(spconv SHARED all.cc indice.cc indice.cu 
-            reordering.cc reordering.cu maxpool.cc maxpool.cu nms.cc
-            pillar_scatter.cu)
+set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc)
+if (SPCONV_BuildCUDA)
+    set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu)
+endif()
+add_library(spconv SHARED ${ALL_FILES})

 target_include_directories(spconv PRIVATE ${ALL_INCLUDE} )
 set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
 set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
 set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-target_link_libraries(spconv PRIVATE ${ALL_LIBS} cudahash)
+if (SPCONV_BuildCUDA)
+    target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash)
+else()
+    target_link_libraries(spconv PRIVATE ${ALL_LIBS})
+endif()
 install (TARGETS spconv DESTINATION lib)
--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <cuda_runtime_api.h>
 #include <spconv/pool_ops.h>
 #include <spconv/spconv_ops.h>
 #include <spconv/pillar_scatter_ops.h>
@@ -35,9 +34,9 @@ static auto registry =
        .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
        .op("spconv::indice_maxpool_backward_fp32",
            &spconv::indiceMaxPoolBackward<float>)
-        // .op("spconv::indice_maxpool_half", &spconv::indiceMaxPool<at::Half>)
-        // .op("spconv::indice_maxpool_backward_half",
-        //     &spconv::indiceMaxPoolBackward<at::Half>)
+        .op("spconv::indice_maxpool_half", &spconv::indiceMaxPool<at::Half>)
+        .op("spconv::indice_maxpool_backward_half",
+            &spconv::indiceMaxPoolBackward<at::Half>)
        .op("spconv::nms", &spconv::nonMaxSuppression<float>)
        .op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>)
        .op("spconv::pillar_scatter_half", &spconv::pointPillarScatter<at::Half>);
\ No newline at end of file
--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
@@ -22,7 +22,7 @@
 #include <tensorview/tensorview.h>
 #include <type_traits>
 #include <utility/timer.h>
-#include <hash/hash_table.h>
+#include <cuhash/hash_table.h>

 namespace spconv {
 namespace functor {
@@ -78,24 +78,28 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0)
      return 0;
-    Index numAct = indicePairUnique.dim(0) - 1;
+    // after unique, the last element of indicePairUnique is std::numeric_limits<int>::max(), so it is not counted
+    Index numAct = indicePairUnique.dim(0) - 1; 
    if (useHash){
-      auto table = cudahash::HashTable();
-      table.Initialize(numAct, 2.0);
-      Index *d_values = nullptr;
-      cudaMalloc((void**)&d_values, sizeof(Index) * numAct);
+      auto table = cuhash::HashTable();
+      // std::cout << "create " << numAct << " size table..." << std::endl;
+      table.Initialize(numAct, 2.0, 4);
+      unsigned *d_values = nullptr;
+      cudaMalloc((void**)&d_values, sizeof(unsigned) * numAct);
      TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
-      arangeKernel<Index><<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
+      arangeKernel<unsigned><<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
            d.getStream()>>>(d_values, numAct);
      bool res = table.Build(numAct, reinterpret_cast<unsigned*>(indicePairUnique.data()), 
-                reinterpret_cast<unsigned*>(d_values));
-      TV_ASSERT_RT_ERR(res, "err");
+                d_values);
+      cudaFree(d_values);
+      if (!res){
+        return -1; // -1 tells the caller to fall back to the CPU implementation
+      }
      assignIndiceOutKernel<Index, NDim>
          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
            d.getStream()>>>(indicesOut, numAct,
                          indicePairUnique, outSpatialShape, batchSize);
      TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
-      cudaFree(d_values);
      auto tableSize = table.get_table_size();
      auto tableData = table.data();
      auto constants = table.get_constants_4();
@@ -149,8 +153,9 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
      return 0;
    // auto timer = spconv::CudaContextTimer<>();
    if (useHash){
-      auto table = cudahash::HashTable();
-      table.Initialize(numActIn, 2.0);
+      auto table = cuhash::HashTable();
+      // std::cout << "subm create " << numActIn << " size table..." << std::endl;
+      table.Initialize(numActIn, 2.0, 4);
      unsigned *d_keyvalues = nullptr;
      cudaMalloc((void**)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
      unsigned *d_values = d_keyvalues + numActIn;
@@ -160,8 +165,10 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
      TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
      bool res = table.Build(numActIn, reinterpret_cast<unsigned*>(d_keyvalues), 
                reinterpret_cast<unsigned*>(d_values));
-      TV_ASSERT_RT_ERR(res, "err");
      cudaFree(d_keyvalues);
+      if (!res){
+        return -1; // -1 tells the caller to fall back to the CPU implementation
+      }
      auto tableSize = table.get_table_size();
      auto tableData = table.data();
      auto constants = table.get_constants_4();