v1.1 alpha: add cuda hash implementation

47527b00 · traveller59 · 5df97387 · 47527b00 · 47527b00 · 47527b00
Commit 47527b00 authored May 22, 2019 by traveller59
17 changed files
--- a/spconv/ops.py
+++ b/spconv/ops.py
@@ -52,7 +52,8 @@ def get_indice_pairs(indices,
             out_padding=0,
             subm=False,
             transpose=False,
-             grid=None):
+             grid=None,
+             use_hash=True):
    ndim = indices.shape[1] - 1
    if not isinstance(ksize, (list, tuple)):
        ksize = [ksize] * ndim
@@ -88,7 +89,7 @@ def get_indice_pairs(indices,
        else:
            raise NotImplementedError
        return get_indice_pairs_func(indices, batch_size, out_shape, spatial_shape, ksize,
-                            stride, padding, dilation, out_padding, int(subm), int(transpose))
+                            stride, padding, dilation, out_padding, int(subm), int(transpose), int(use_hash))
    else:
        if ndim == 2:
            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_2d
@@ -97,7 +98,7 @@ def get_indice_pairs(indices,
        else:
            raise NotImplementedError
        return get_indice_pairs_func(indices, grid, batch_size, out_shape, spatial_shape, ksize,
-                            stride, padding, dilation, out_padding, int(subm), int(transpose))
+                            stride, padding, dilation, out_padding, int(subm), int(transpose), int(use_hash))



@@ -177,3 +178,11 @@ def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
    res = torch.ops.spconv.nms(
        boxes, scores, pre_max_size, post_max_size, thresh, eps)
    return res
+
+def pillar_scatter(features, coors, shape):
+    if features.dtype == torch.float32:
+        return torch.ops.spconv.pillar_scatter_float(features, coors, shape)
+    elif features.dtype == torch.half:
+        return torch.ops.spconv.pillar_scatter_half(features, coors, shape)
+    else:
+        raise NotImplementedError
--- a/spconv/utils/__init__.py
+++ b/spconv/utils/__init__.py
@@ -13,12 +13,12 @@
 # limitations under the License.

 import numpy as np
+
 from spconv import spconv_utils
 from spconv.spconv_utils import (
    non_max_suppression, non_max_suppression_cpu, points_to_voxel_3d_np,
-    rbbox_iou, points_to_voxel_3d_np_mean, points_to_voxel_3d_np_height,
-    points_to_voxel_3d_with_filtering, rotate_non_max_suppression_cpu,
-    rbbox_intersection)
+    points_to_voxel_3d_np_mean, points_to_voxel_3d_with_filtering,
+    rbbox_intersection, rbbox_iou, rotate_non_max_suppression_cpu)


 def points_to_voxel(points,
@@ -28,11 +28,11 @@ def points_to_voxel(points,
                    max_points=35,
                    max_voxels=20000,
                    full_mean=False,
-                    with_height=False,
                    block_filtering=True,
                    block_factor=1,
                    block_size=8,
                    height_threshold=0.2,
+                    height_high_threshold=3.0,
                    pad_output=False):
    """convert 3d points(N, >=3) to voxels. This version calculate
    everything in one loop. now it takes only 0.8ms(~6k voxels) 
@@ -51,7 +51,6 @@ def points_to_voxel(points,
            before call this function because max_voxels may drop some points.
        full_mean: bool. if true, all empty points in voxel will be filled with mean
            of exist points.
-        with_height: bool. don't use this.
        block_filtering: filter voxels by height. used for lidar point cloud.
            use some visualization tool to see filtered result.
    Returns:
@@ -71,67 +70,63 @@ def points_to_voxel(points,
    num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
    voxels = np.zeros(
        shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype)
+    voxel_point_mask = np.zeros(
+        shape=(max_voxels, max_points), dtype=points.dtype)
    coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
    res = {
        "voxels": voxels,
        "coordinates": coors,
        "num_points_per_voxel": num_points_per_voxel,
+        "voxel_point_mask": voxel_point_mask,
    }
    if full_mean:
        means = np.zeros(
            shape=(max_voxels, points.shape[-1]), dtype=points.dtype)
        voxel_num = points_to_voxel_3d_np_mean(
-            points, voxels, means, coors,
+            points, voxels, voxel_point_mask, means, coors,
            num_points_per_voxel, coor_to_voxelidx, voxel_size.tolist(),
            coors_range.tolist(), max_points, max_voxels)
    else:
-        if with_height:
-            heights = np.zeros(
-                shape=(max_voxels, points.shape[-1]), dtype=points.dtype)
-            maxs = np.zeros(
-                shape=(max_voxels, points.shape[-1]), dtype=points.dtype)
-            res["heights"] = heights
-            voxel_num = points_to_voxel_3d_np_height(
-                points, voxels, heights, maxs, coors,
+        if block_filtering:
+            block_shape = [*voxelmap_shape[1:]]
+            block_shape = [b // block_factor for b in block_shape]
+            mins = np.full(block_shape, 99999999, dtype=points.dtype)
+            maxs = np.full(block_shape, -99999999, dtype=points.dtype)
+            voxel_mask = np.zeros((max_voxels, ), dtype=np.int32)
+            voxel_num = points_to_voxel_3d_with_filtering(
+                points, voxels, voxel_point_mask, voxel_mask, mins, maxs,
+                coors, num_points_per_voxel, coor_to_voxelidx,
+                voxel_size.tolist(), coors_range.tolist(), max_points,
+                max_voxels, block_factor, block_size, height_threshold,
+                height_high_threshold)
+            voxel_mask = voxel_mask.astype(np.bool_)
+            coors_ = coors[voxel_mask]
+            if pad_output:
+                res["coordinates"][:voxel_num] = coors_
+                res["voxels"][:voxel_num] = voxels[voxel_mask]
+                res["voxel_point_mask"][:voxel_num] = voxel_point_mask[
+                    voxel_mask]
+
+                res["num_points_per_voxel"][:voxel_num] = num_points_per_voxel[
+                    voxel_mask]
+                res["coordinates"][voxel_num:] = 0
+                res["voxels"][voxel_num:] = 0
+                res["num_points_per_voxel"][voxel_num:] = 0
+                res["voxel_point_mask"][voxel_num:] = 0
+            else:
+                res["coordinates"] = coors_
+                res["voxels"] = voxels[voxel_mask]
+                res["num_points_per_voxel"] = num_points_per_voxel[voxel_mask]
+                res["voxel_point_mask"] = voxel_point_mask[voxel_mask]
+            voxel_num = coors_.shape[0]
+        else:
+            voxel_num = points_to_voxel_3d_np(
+                points, voxels, voxel_point_mask, coors,
                num_points_per_voxel, coor_to_voxelidx, voxel_size.tolist(),
                coors_range.tolist(), max_points, max_voxels)
-        else:
-            if block_filtering:
-                block_shape = [*voxelmap_shape[1:]]
-                block_shape = [b // block_factor for b in block_shape]
-                mins = np.full(block_shape, 99999999, dtype=points.dtype)
-                maxs = np.full(block_shape, -99999999, dtype=points.dtype)
-                voxel_mask = np.zeros((max_voxels, ), dtype=np.int32)
-                voxel_num = points_to_voxel_3d_with_filtering(
-                    points, voxels, voxel_mask, mins, maxs,
-                    coors, num_points_per_voxel, coor_to_voxelidx,
-                    voxel_size.tolist(), coors_range.tolist(), max_points,
-                    max_voxels, block_factor, block_size, height_threshold)
-                voxel_mask = voxel_mask.astype(np.bool_)
-                coors_ = coors[voxel_mask]
-                if pad_output:
-                    res["coordinates"][:voxel_num] = coors_
-                    res["voxels"][:voxel_num] = voxels[voxel_mask]
-                    res["num_points_per_voxel"][:
-                                                voxel_num] = num_points_per_voxel[
-                                                    voxel_mask]
-                    res["coordinates"][voxel_num:] = 0
-                    res["voxels"][voxel_num:] = 0
-                    res["num_points_per_voxel"][voxel_num:] = 0
-                else:
-                    res["coordinates"] = coors_
-                    res["voxels"] = voxels[voxel_mask]
-                    res["num_points_per_voxel"] = num_points_per_voxel[
-                        voxel_mask]
-                voxel_num = coors_.shape[0]
-            else:
-                voxel_num = points_to_voxel_3d_np(points, voxels, coors,
-                                                  num_points_per_voxel,
-                                                  coor_to_voxelidx,
-                                                  voxel_size.tolist(),
-                                                  coors_range.tolist(),
-                                                  max_points, max_voxels)
    res["voxel_num"] = voxel_num
+    res["voxel_point_mask"] = res["voxel_point_mask"].reshape(
+        -1, max_points, 1)
    return res


@@ -209,12 +204,11 @@ class VoxelGeneratorV2:
                 max_num_points,
                 max_voxels=20000,
                 full_mean=False,
-                 with_height=False,
                 block_filtering=False,
                 block_factor=8,
                 block_size=3,
-                 height_threshold=0.1):
-        assert with_height is False, "don't use this."
+                 height_threshold=0.1,
+                 height_high_threshold=2.0):
        assert full_mean is False, "don't use this."
        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
        # [0, -40, -3, 70.4, 40, 1]
@@ -236,19 +230,19 @@ class VoxelGeneratorV2:
        self._max_voxels = max_voxels
        self._grid_size = grid_size
        self._full_mean = full_mean
-        self._with_height = with_height
        self._block_filtering = block_filtering
        self._block_factor = block_factor
        self._height_threshold = height_threshold
        self._block_size = block_size
+        self._height_high_threshold = height_high_threshold

    def generate(self, points, max_voxels=None):
        res = points_to_voxel(
            points, self._voxel_size, self._point_cloud_range,
            self._coor_to_voxelidx, self._max_num_points, max_voxels
-            or self._max_voxels, self._full_mean, self._with_height,
-            self._block_filtering, self._block_factor, self._block_size,
-            self._height_threshold)
+            or self._max_voxels, self._full_mean, self._block_filtering,
+            self._block_factor, self._block_size, self._height_threshold,
+            self._height_high_threshold)
        for k, v in res.items():
            if k != "voxel_num":
                res[k] = v[:res["voxel_num"]]
@@ -263,11 +257,11 @@ class VoxelGeneratorV2:
            self._max_num_points,
            max_voxels or self._max_voxels,
            self._full_mean,
-            self._with_height,
            self._block_filtering,
            self._block_factor,
            self._block_size,
            self._height_threshold,
+            self._height_high_threshold,
            pad_output=True)
        return res

@@ -285,4 +279,4 @@ class VoxelGeneratorV2:

    @property
    def grid_size(self):
-        return self._grid_size
\ No newline at end of file
+        return self._grid_size
--- a/src/hash/CMakeLists.txt
+++ b/src/hash/CMakeLists.txt
+add_library(cudahash SHARED hash_functions.cu hash_table.cpp hash_table.cu 
+                            mt19937ar.cpp)
+
+target_include_directories(cudahash PRIVATE ${ALL_INCLUDE} )
+set_property(TARGET cudahash PROPERTY CUDA_STANDARD 14)
+set_property(TARGET cudahash PROPERTY CXX_STANDARD 14)
+set_target_properties(cudahash PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_link_libraries(cudahash PRIVATE ${ALL_LIBS})
+install (TARGETS cudahash DESTINATION lib)
+
+add_executable(cudahash_test main.cc)
+
+target_include_directories(cudahash_test PRIVATE ${ALL_INCLUDE} )
+set_property(TARGET cudahash_test PROPERTY CUDA_STANDARD 14)
+set_property(TARGET cudahash_test PROPERTY CXX_STANDARD 14)
+set_target_properties(cudahash_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_link_libraries(cudahash_test PRIVATE ${ALL_LIBS} cudahash)
+install (TARGETS cudahash_test DESTINATION bin)
--- a/src/hash/debugging.cpp
+++ b/src/hash/debugging.cpp
+// -------------------------------------------------------------
+// cuDPP -- CUDA Data Parallel Primitives library
+// -------------------------------------------------------------
+// $Revision:$
+// $Date:$
+// ------------------------------------------------------------- 
+// This source code is distributed under the terms of license.txt in
+// the root directory of this source distribution.
+// ------------------------------------------------------------- 
+
+/**
+ * @file
+ * debugging.cpp
+ *
+ * @brief Debugging/statistics/performance utilities for hash tables.
+ */
+
+#include <hash/debugging.h>
+#include <hash/definitions.h>
+
+#include <algorithm>
+#include <cstring>
+#include <hash/cuda_util.h>
+
+namespace cudahash {
+
+
+void OutputRetrievalStatistics(const unsigned  n_queries,
+                               const unsigned *d_retrieval_probes,
+                               const unsigned  n_functions)
+{
+  unsigned *retrieval_probes = new unsigned[n_queries];
+  CUDA_SAFE_CALL(cudaMemcpy(retrieval_probes,
+                            d_retrieval_probes,
+                            sizeof(unsigned) * n_queries,
+                            cudaMemcpyDeviceToHost));
+
+  // Create a histogram showing how many items needed how many probes to be found.
+  unsigned possible_probes = n_functions + 2;
+  unsigned *histogram = new unsigned[possible_probes];
+  memset(histogram, 0, sizeof(unsigned) * (possible_probes));
+  for (unsigned i = 0; i < n_queries; ++i) {
+    histogram[retrieval_probes[i]]++;
+  }
+
+  // Dump it.
+  char buffer[10000];
+  sprintf(buffer, "Probes for retrieval: ");
+  PrintMessage(buffer);
+  for (unsigned i = 0; i < possible_probes; ++i) {
+    sprintf(buffer, "\t(%u, %u)", i, histogram[i]);
+    PrintMessage(buffer);
+  }
+  delete [] retrieval_probes;
+  delete [] histogram;
+}
+
+
+void OutputBuildStatistics(const unsigned  n,
+                           const unsigned *d_iterations_taken) {
+  // Output how many iterations each thread took until it found an empty slot.
+  unsigned *iterations_taken = new unsigned[n];
+  CUDA_SAFE_CALL(cudaMemcpy(iterations_taken, d_iterations_taken, sizeof(unsigned) * n, cudaMemcpyDeviceToHost));
+  std::sort(iterations_taken, iterations_taken + n);
+  unsigned total_iterations = 0;
+  unsigned max_iterations_taken = 0;
+  for (unsigned i = 0; i < n; ++i) {
+    total_iterations += iterations_taken[i];
+    max_iterations_taken = std::max(max_iterations_taken, iterations_taken[i]);
+  }
+
+  unsigned current_value = iterations_taken[0];
+  unsigned count = 1;
+  char buffer[10000];
+  sprintf(buffer, "Iterations taken:\n");
+  for (unsigned i = 1; i < n; ++i) {
+    if (iterations_taken[i] != current_value) {
+      sprintf(buffer, "%s\t(%u, %u)\n", buffer, current_value, count);
+      current_value = iterations_taken[i];
+      count = 1;
+    } else {
+      count++;
+    }
+  }
+  sprintf(buffer, "%s\t(%u, %u)", buffer, current_value, count);
+  PrintMessage(buffer);
+  sprintf(buffer, "Total iterations: %u", total_iterations);
+  PrintMessage(buffer);
+  sprintf(buffer, "Avg/Med/Max iterations: (%f %u %u)", (float)total_iterations / n, iterations_taken[n/2], iterations_taken[n-1]);
+  PrintMessage(buffer);
+  delete [] iterations_taken;
+
+  // Print the length of the longest eviction chain.
+  sprintf(buffer, "Max iterations: %u", max_iterations_taken);
+  PrintMessage(buffer);
+}
+
+
+}; // namespace CuckooHashing
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
--- a/src/hash/debugging.cu
+++ b/src/hash/debugging.cu
+// -------------------------------------------------------------
+// cuDPP -- CUDA Data Parallel Primitives library
+// -------------------------------------------------------------
+// $Revision:$
+// $Date:$
+// ------------------------------------------------------------- 
+// This source code is distributed under the terms of license.txt in
+// the root directory of this source distribution.
+// ------------------------------------------------------------- 
+
+/**
+ * @file
+ * debugging.cu
+ *
+ * @brief Debugging/statistics/performance utilities for hash tables.
+ */
+
+#include <hash/debugging.h>
+#include <hash/definitions.h>
+#include <hash/hash_table.cuh>
+
+#include <algorithm>
+#include <hash/cuda_util.h>
+
+namespace cudahash {
+
+
+//! Debugging function: Takes statistics on the hash functions' distribution.
+/*! Determines:
+ *    - How many unique slots each key has.
+ *    - How many keys hash into each slot.
+ *    - Whether any keys failed to get a full set of slots.
+ */
+__global__
+void take_hash_function_statistics_kernel(const unsigned  *keys,
+                                          const unsigned   n_entries,
+                                          const unsigned   table_size,
+                                          const uint2     *constants,
+                                          const unsigned   num_functions,
+                                                unsigned  *num_slots_available,
+                                                unsigned  *num_hashing_in,
+                                                unsigned  *failed) {
+  unsigned thread_index = threadIdx.x +
+                          blockIdx.x * blockDim.x +
+                          blockIdx.y * blockDim.x * gridDim.x;
+
+  if (thread_index >= n_entries)
+    return;
+  unsigned key = keys[thread_index];
+
+  // Determine all of the locations the key hashes into.
+  // Also count how many keys hash into each location.
+  unsigned locations[kMaxHashFunctions];
+  for (unsigned i = 0; i < num_functions; ++i) {
+    locations[i] = hash_function_inner(constants[i], key) % table_size;
+
+    if (num_hashing_in != NULL) {
+      atomicAdd(num_hashing_in + locations[i], 1);
+    }
+  }
+
+  // Determine whether all of the locations were different.
+  unsigned num_slots = 1;
+  for (unsigned i = 1; i < num_functions; ++i) {
+    bool matched = false;
+    for (unsigned j = 0; j < i; ++j) {
+      if (locations[i] == locations[j]) {
+        matched = true;
+        break;
+      }
+    }
+    if (!matched) {
+      num_slots++;
+    }
+  }
+
+  if (num_slots_available != NULL) {
+    num_slots_available[thread_index] = num_slots;
+  }
+
+  if (failed != NULL && num_slots != num_functions) {
+    *failed = 1;
+  }
+}
+
+
+void TakeHashFunctionStatistics(const unsigned   num_keys,
+                                const unsigned  *d_keys,
+                                const unsigned   table_size,
+                                const uint2     *constants,
+                                const unsigned   kNumHashFunctions) {
+  char buffer[16000];
+  PrintMessage("Hash function constants: ");
+
+  for (unsigned i = 0; i < kNumHashFunctions; ++i) {
+    sprintf(buffer, "\t%10u, %10u", constants[i].x, constants[i].y);
+    PrintMessage(buffer);
+  }
+
+  unsigned *d_num_hashing_in = NULL;
+ #ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
+  CUDA_SAFE_CALL(cudaMalloc((void**)&d_num_hashing_in,
+                             sizeof(unsigned) * table_size));
+  CUDA_SAFE_CALL(cudaMemset(d_num_hashing_in, 0, sizeof(unsigned) * table_size));
+ #endif
+
+  unsigned *d_num_slots_available = NULL;
+ #ifdef COUNT_HOW_MANY_HAVE_CYCLES
+  CUDA_SAFE_CALL(cudaMalloc((void**)&d_num_slots_available,
+                            sizeof(unsigned) * num_keys));
+ #endif
+  uint2 *d_constants = NULL;
+  CUDA_SAFE_CALL(cudaMalloc((void**)&d_constants, sizeof(uint2) * kNumHashFunctions));
+  CUDA_SAFE_CALL(cudaMemcpy(d_constants, constants, sizeof(uint2) * kNumHashFunctions, cudaMemcpyHostToDevice));
+
+  take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys), kBlockSize>>>
+                                      (d_keys, num_keys,
+                                       table_size,
+                                       d_constants,
+                                       kNumHashFunctions,
+                                       d_num_slots_available,
+                                       d_num_hashing_in,
+                                       NULL);
+  CUDA_SAFE_CALL(cudaFree(d_constants));
+
+ #ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
+  unsigned *num_hashing_in = new unsigned[table_size];
+  CUDA_SAFE_CALL(cudaMemcpy(num_hashing_in,
+                            d_num_hashing_in,
+                            sizeof(unsigned) * table_size,
+                            cudaMemcpyDeviceToHost));
+
+  /*
+  // Print how many items hash into each slot.
+  // Used to make sure items are spread evenly throughout the table.
+  buffer[0] = '\0';
+  PrintMessage("Num hashing into each: ", true);
+  for (unsigned i = 0; i < table_size; ++i) {
+    sprintf(buffer, "%s\t%2u", buffer, num_hashing_in[i]);
+    if (i % 25 == 24) {
+      PrintMessage(buffer, true);
+      buffer[0] = '\0';
+    }
+  }
+  PrintMessage(buffer,true);
+  */
+
+  // Print a histogram of how many items are hashed into each slot.  Shows
+  // if average number of items hashing into each slot is low.
+  std::sort(num_hashing_in, num_hashing_in + table_size);
+  int count = 1;
+  unsigned previous = num_hashing_in[0];
+  sprintf(buffer, "Num items hashing into a slot:\t");
+  PrintMessage(buffer);
+  for (unsigned i = 1; i < table_size; ++i) {
+    if (num_hashing_in[i] != previous) {
+      sprintf(buffer, "\t(%u, %u)", previous, count);
+      PrintMessage(buffer);
+      previous = num_hashing_in[i];
+      count = 1;
+    } else {
+      count++;
+    }
+  }
+  sprintf(buffer, "\t(%u, %u)", previous, count);
+  PrintMessage(buffer);
+
+  delete [] num_hashing_in;
+  CUDA_SAFE_CALL(cudaFree(d_num_hashing_in));
+ #endif
+
+ #ifdef COUNT_HOW_MANY_HAVE_CYCLES
+  unsigned *num_slots_available = new unsigned[num_keys];
+  CUDA_SAFE_CALL(cudaMemcpy(num_slots_available,
+                            d_num_slots_available,
+                            sizeof(unsigned) * num_keys,
+                            cudaMemcpyDeviceToHost));
+
+  static const unsigned kHistogramSize = kNumHashFunctions + 1;
+  unsigned *histogram = new unsigned[kHistogramSize];
+  memset(histogram, 0, sizeof(unsigned) * kHistogramSize);
+  for (unsigned i = 0; i < num_keys; ++i) {
+    histogram[num_slots_available[i]]++;
+  }
+
+  sprintf(buffer, "Slots assigned to each key: ");
+  for (unsigned i = 1; i < kHistogramSize; ++i) {
+    sprintf(buffer, "%s(%u, %u) ", buffer, i, histogram[i]);
+  }
+  PrintMessage(buffer);
+
+  delete [] histogram;
+  delete [] num_slots_available;
+  CUDA_SAFE_CALL(cudaFree(d_num_slots_available));
+ #endif
+}
+
+bool CheckAssignedSameSlot(const unsigned  N,
+                           const unsigned  num_keys,
+                           const unsigned *d_keys,
+                           const unsigned  table_size,
+                                 uint2    *constants) {
+  unsigned *d_cycle_exists = NULL;
+  uint2    *d_constants    = NULL;
+
+  CUDA_SAFE_CALL(cudaMalloc((void**)&d_cycle_exists, sizeof(unsigned)));
+  CUDA_SAFE_CALL(cudaMalloc((void**)&d_constants, sizeof(uint2) * N));
+
+  CUDA_SAFE_CALL(cudaMemset(d_cycle_exists, 0, sizeof(unsigned)));
+  CUDA_SAFE_CALL(cudaMemcpy(d_constants,
+                            constants,
+                            sizeof(uint2) * N,
+                            cudaMemcpyHostToDevice));
+
+  // Check if all keys were given a full set of N slots by the functions.
+  take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys), kBlockSize>>>
+                                      (d_keys, num_keys, table_size, d_constants, N,
+                                       NULL, NULL, d_cycle_exists);
+
+  unsigned cycle_exists;
+  CUDA_SAFE_CALL(cudaMemcpy(&cycle_exists,
+                            d_cycle_exists,
+                            sizeof(unsigned),
+                            cudaMemcpyDeviceToHost));
+
+  CUDA_SAFE_CALL(cudaFree(d_cycle_exists));
+  CUDA_SAFE_CALL(cudaFree(d_constants));
+
+  return (cycle_exists != 0);
+}
+
+
+void PrintStashContents(const Entry *d_stash) {
+  Entry *stash = new Entry[cudahash::kStashSize];
+  CUDA_SAFE_CALL(cudaMemcpy(stash, d_stash, sizeof(Entry) * cudahash::kStashSize, cudaMemcpyDeviceToHost));
+  for (unsigned i = 0; i < cudahash::kStashSize; ++i) {
+    if (get_key(stash[i]) != kKeyEmpty) {
+      char buffer[256];
+      sprintf(buffer, "Stash[%u]: %u = %u", i, get_key(stash[i]), get_value(stash[i]));
+      PrintMessage(buffer, true);
+    }
+  }
+  delete [] stash;
+}
+
+
+}; // namespace CuckooHashing
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
--- a/src/hash/hash_functions.cu
+++ b/src/hash/hash_functions.cu
+#include <hash/hash_table.h>
+#include <hash/debugging.h>
+
+#include <hash/mt19937ar.h>
+
+#include <cassert>
+
+namespace cudahash {
+
+void GenerateFunctions(const unsigned  N,
+                       const unsigned  num_keys,
+                       const unsigned *d_keys,
+                       const unsigned  table_size,
+                             uint2    *constants) {
+  bool regenerate = true;
+
+  while (regenerate) {
+    regenerate = false;
+
+    // Generate a set of hash function constants for this build attempt.
+    for (unsigned i = 0 ; i < N; ++i) {
+      unsigned new_a = genrand_int32() % kPrimeDivisor;
+      constants[i].x = (1 > new_a ? 1 : new_a);
+      constants[i].y = genrand_int32() % kPrimeDivisor;
+    }
+
+#ifdef FORCEFULLY_GENERATE_NO_CYCLES
+    // Ensure that every key gets N different slots.
+    regenerate = CheckAssignedSameSlot(N, num_keys, d_keys, table_size, constants);
+#endif
+  }
+
+
+#ifdef TAKE_HASH_FUNCTION_STATISTICS
+  // Examine how well distributed the items are.
+  TakeHashFunctionStatistics(num_keys, d_keys, table_size, constants, N);
+#endif
+}
+
+}; // namespace CuckooHashing
--- a/src/hash/hash_table.cpp
+++ b/src/hash/hash_table.cpp
+// -------------------------------------------------------------
+// cuDPP -- CUDA Data Parallel Primitives library
+// -------------------------------------------------------------
+// $Revision:$
+// $Date:$
+// ------------------------------------------------------------- 
+// This source code is distributed under the terms of license.txt in
+// the root directory of this source distribution.
+// ------------------------------------------------------------- 
+
+/**
+ * @file hash_table.cpp
+ *
+ * @brief Implements a basic hash table that stores one value per key.
+ */
+
+#include <hash/hash_table.h>
+#include <hash/debugging.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <limits>
+#include <hash/mt19937ar.h>
+
+#include <cuda_runtime_api.h>
+#include <hash/cuda_util.h>
+
+namespace cudahash {
+
+char buffer[256];
+
+//! @name Internal
+/// @{
+dim3 ComputeGridDim(unsigned n) {
+    // Round up in order to make sure all items are hashed in.
+    dim3 grid( (n + kBlockSize-1) / kBlockSize );
+    if (grid.x > kGridSize) {
+        grid.y = (grid.x + kGridSize - 1) / kGridSize;
+        grid.x = kGridSize;
+    }
+    return grid;
+}
+
+
+unsigned ComputeMaxIterations(const unsigned n,
+                              const unsigned table_size,
+                              const unsigned num_functions) {
+    float lg_input_size = (float)(log((double)n) / log(2.0));
+
+// #define CONSTANT_ITERATIONS
+#ifdef CONSTANT_ITERATIONS
+    // Set the maximum number of iterations to 7lg(N).
+    const unsigned MAX_ITERATION_CONSTANT = 7;
+    unsigned max_iterations = MAX_ITERATION_CONSTANT * lg_input_size;
+#else
+    // Use an empirical formula for determining what the maximum number of
+    // iterations should be.  Works OK in most situations.
+    float load_factor = float(n) / table_size;
+    float ln_load_factor = (float)(log(load_factor) / log(2.71828183));
+
+    unsigned max_iterations = (unsigned)(4.0 * ceil(-1.0 / (0.028255 + 1.1594772 * 
+                                               ln_load_factor)* lg_input_size));
+#endif
+    return max_iterations;
+}
+/// @}
+
+
+HashTable::HashTable() : table_size_(0),
+                         d_contents_(NULL),
+                         stash_count_(0),
+                         d_failures_(NULL) {
+    CUDA_CHECK_ERROR("Failed in constructor.\n");                         
+}                         
+
+
+bool HashTable::Initialize(const unsigned max_table_entries,
+                           const float    space_usage,
+                           const unsigned num_functions) {
+    Release();
+
+    // Determine the minimum amount of slots the table requires,
+    // and whether the space_usage is within range.
+    float minimum_space_usage;
+    if (num_functions < 2 || num_functions > 5) {
+        char message[256] = "Number of hash functions must be from 2 to 5; "
+            "others are unimplemented.";
+        PrintMessage(message, true);
+        return false;
+    } else {
+        minimum_space_usage = kMinimumSpaceUsages[num_functions];
+    }
+
+    if (space_usage < minimum_space_usage) {
+        sprintf(buffer, "Minimum possible space usage for %u functions is %f.",
+                num_functions, minimum_space_usage);
+        PrintMessage(buffer);
+        return false;
+    }
+
+    num_hash_functions_ = num_functions;
+    table_size_ = unsigned(ceil(max_table_entries * space_usage));
+
+    // Allocate memory.
+    const unsigned slots_to_allocate = table_size_ + kStashSize;
+    CUDA_SAFE_CALL(cudaMalloc( (void**)&d_contents_, 
+                               sizeof(Entry) * slots_to_allocate ));
+    CUDA_SAFE_CALL(cudaMalloc( (void**)&d_failures_, sizeof(unsigned) ));
+    if (!d_contents_ || !d_failures_) {
+        fprintf(stderr, "Failed to allocate %u slots.\n", slots_to_allocate);
+        return false;
+    }
+    CUDA_CHECK_ERROR("Failed to initialize.\n");
+
+    return true;
+}
+
+
+void HashTable::Release() {
+    table_size_  = 0;
+
+    CUDA_SAFE_CALL(cudaFree(d_contents_));
+    CUDA_SAFE_CALL(cudaFree(d_failures_));
+
+    d_contents_ = NULL;
+    d_failures_ = NULL;
+
+    CUDA_CHECK_ERROR("Failed during release.\n");
+}
+
+
+bool HashTable::Build(const unsigned  n,
+                      const unsigned *d_keys,
+                      const unsigned *d_values) {
+    unsigned max_iterations = ComputeMaxIterations(n, table_size_, 
+                                                   num_hash_functions_);
+    unsigned num_failures = 1;
+    unsigned num_attempts = 0;
+
+    // Storage for statistics collection.
+    unsigned *d_iterations_taken = NULL;
+#ifdef TRACK_ITERATIONS
+    CUDA_SAFE_CALL(cudaMalloc((void**)&d_iterations_taken, sizeof(unsigned) * n));
+#endif
+
+    // Track how many items ended up in the stash.
+    unsigned *d_stash_count = NULL;
+    CUDA_SAFE_CALL(cudaMalloc((void**)&d_stash_count, sizeof(unsigned)));
+    CUDA_CHECK_ERROR("Failed before main build loop.\n");
+
+    // Main build loop.
+    while (num_failures && ++num_attempts < kMaxRestartAttempts) {
+        CUDA_SAFE_CALL(cudaMemset(d_stash_count, 0, sizeof(unsigned)));
+    
+        // Generate new hash functions.
+        if (num_hash_functions_ == 2)
+            constants_2_.Generate(n, d_keys,table_size_);
+        else if (num_hash_functions_ == 3)
+            constants_3_.Generate(n, d_keys,table_size_);
+        else if (num_hash_functions_ == 4)
+            constants_4_.Generate(n, d_keys,table_size_);
+        else
+            constants_5_.Generate(n, d_keys,table_size_);
+
+        stash_constants_.x = std::max(1lu, genrand_int32()) % kPrimeDivisor;
+        stash_constants_.y = genrand_int32() % kPrimeDivisor;
+        stash_count_ = 0;
+
+        // Initialize memory.
+        unsigned slots_in_table = table_size_ + kStashSize;
+        CUDAWrapper::ClearTable(slots_in_table,
+                                kEntryEmpty,
+                                d_contents_);
+
+        num_failures = 0;
+
+        CUDAWrapper::CallCuckooHash(n,
+                                    num_hash_functions_,
+                                    d_keys,
+                                    d_values,
+                                    table_size_,
+                                    constants_2_,
+                                    constants_3_,
+                                    constants_4_,
+                                    constants_5_,
+                                    max_iterations,
+                                    d_contents_,
+                                    stash_constants_,
+                                    d_stash_count,
+                                    d_failures_,
+                                    d_iterations_taken);
+             
+        // Check if successful.
+        CUDA_SAFE_CALL(cudaMemcpy( &num_failures, d_failures_, sizeof(unsigned), cudaMemcpyDeviceToHost ));
+
+#ifdef COUNT_UNINSERTED
+        if (num_failures) {
+            printf("Failed to insert %u items.\n", num_failures);
+        }
+#endif
+    }
+
+    // Copy out the stash size.
+    CUDA_SAFE_CALL(cudaMemcpy( &stash_count_, d_stash_count, sizeof(unsigned), cudaMemcpyDeviceToHost ));
+    if (stash_count_ && num_failures == 0) {
+        sprintf(buffer, "Stash size: %u", stash_count_);
+        PrintMessage(buffer, true);
+
+#ifdef _DEBUG
+        PrintStashContents(d_contents_ + table_size_);
+#endif    
+    }
+    CUDA_SAFE_CALL(cudaFree(d_stash_count));
+
+#ifdef TRACK_ITERATIONS
+    if (num_failures == 0) {
+        OutputBuildStatistics(n, d_iterations_taken);
+    }
+    CUDA_SAFE_CALL(cudaFree(d_iterations_taken));
+#endif
+
+    // Dump some info if a restart was required.
+    if (num_attempts >= kMaxRestartAttempts) {
+        sprintf(buffer, "Completely failed to build");
+        PrintMessage(buffer, true);
+    } else if (num_attempts > 1) {
+        sprintf(buffer, "Needed %u attempts to build", num_attempts);
+        PrintMessage(buffer, true);
+    }
+
+    CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
+    return num_failures == 0;
+}
+
+
+void HashTable::Retrieve(const unsigned  n_queries,
+                         const unsigned *d_keys,
+                         unsigned *d_values) {
+    CUDAWrapper::CallHashRetrieve(n_queries,
+                                  num_hash_functions_,
+                                  d_keys,
+                                  table_size_,
+                                  d_contents_,
+                                  constants_2_,
+                                  constants_3_,
+                                  constants_4_,
+                                  constants_5_,
+                                  stash_constants_,
+                                  stash_count_,
+                                  d_values);
+}
+
+
+};  // namesapce CuckooHashing
+
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
--- a/src/hash/hash_table.cu
+++ b/src/hash/hash_table.cu
+// -------------------------------------------------------------
+// cuDPP -- CUDA Data Parallel Primitives library
+// -------------------------------------------------------------
+// $Revision:$
+// $Date:$
+// ------------------------------------------------------------- 
+// This source code is distributed under the terms of license.txt in
+// the root directory of this source distribution.
+// ------------------------------------------------------------- 
+
+/**
+ * @file hash_table.cu
+ *
+ * @brief Hides all of the CUDA calls from the actual CPP file.
+ */
+
+#include <hash/cuda_util.h>
+#include <hash/debugging.h>
+#include <hash/definitions.h>
+#include <hash/hash_table.cuh>
+
+#include <cuda.h>
+
+namespace cudahash {
+
+namespace CUDAWrapper {
+    void ClearTable(const unsigned  slots_in_table,
+                    const Entry     fill_value,
+                          Entry    *d_contents) {
+        clear_table<Entry><<<ComputeGridDim(slots_in_table), kBlockSize>>>
+            (slots_in_table, fill_value, d_contents);
+        TV_CHECK_CUDA_ERR_V2("Error occurred during hash table clear.\n");
+    }
+
+    void CallCuckooHash(const unsigned      n,
+                        const unsigned      num_hash_functions,
+                        const unsigned     *d_keys,
+                        const unsigned     *d_values,
+                        const unsigned      table_size,
+                        const Functions<2>  constants_2,
+                        const Functions<3>  constants_3,
+                        const Functions<4>  constants_4,
+                        const Functions<5>  constants_5,
+                        const unsigned      max_iterations,
+                              Entry        *d_contents,
+                              uint2         stash_constants,
+                              unsigned     *d_stash_count,
+                              unsigned     *d_failures,
+                              unsigned     *d_iterations_taken) {
+        // Build the table.
+        cudaMemset(d_failures, 0, sizeof(unsigned));
+        if (num_hash_functions == 2) {
+            CuckooHash<<<ComputeGridDim(n), kBlockSize>>>
+                (n,
+                 d_keys,
+                 d_values,
+                 table_size,
+                 constants_2,
+                 max_iterations,
+                 d_contents,
+                 stash_constants,
+                 d_stash_count,
+                 d_failures,
+                 d_iterations_taken);
+        } else if (num_hash_functions == 3) {
+            CuckooHash<<<ComputeGridDim(n), kBlockSize>>>
+                (n,
+                 d_keys,
+                 d_values,
+                 table_size,
+                 constants_3,
+                 max_iterations,
+                 d_contents,
+                 stash_constants,
+                 d_stash_count,
+                 d_failures,
+                 d_iterations_taken);
+        } else if (num_hash_functions == 4) {
+            CuckooHash<<<ComputeGridDim(n), kBlockSize>>>
+                (n,
+                 d_keys,
+                 d_values,
+                 table_size,
+                 constants_4,
+                 max_iterations,
+                 d_contents,
+                 stash_constants,
+                 d_stash_count,
+                 d_failures,
+                 d_iterations_taken);
+        } else {
+            CuckooHash<<<ComputeGridDim(n), kBlockSize>>>
+                (n,
+                 d_keys,
+                 d_values,
+                 table_size,
+                 constants_5,
+                 max_iterations,
+                 d_contents,
+                 stash_constants,
+                 d_stash_count,
+                 d_failures,
+                 d_iterations_taken);
+        }
+    
+        CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
+    }
+
+
+    void CallHashRetrieve(const unsigned      n_queries,
+                          const unsigned      num_hash_functions,
+                          const unsigned     *d_keys,
+                          const unsigned      table_size,
+                          const Entry        *d_contents,
+                          const Functions<2>  constants_2,
+                          const Functions<3>  constants_3,
+                          const Functions<4>  constants_4,
+                          const Functions<5>  constants_5,
+                          const uint2         stash_constants,
+                          const unsigned      stash_count,
+                                unsigned     *d_values) {
+        unsigned *d_retrieval_probes = NULL;
+    #ifdef TRACK_ITERATIONS
+        CUDA_SAFE_CALL(cudaMalloc((void**)&d_retrieval_probes, sizeof(unsigned) * n_queries));
+    #endif
+    
+        if (num_hash_functions == 2) {
+            hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>
+                (n_queries,
+                 d_keys,
+                 table_size,
+                 d_contents,
+                 constants_2,
+                 stash_constants,
+                 stash_count,
+                 d_values,
+                 d_retrieval_probes);
+        } else if (num_hash_functions == 3) {
+            hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>
+                (n_queries,
+                 d_keys,
+                 table_size,
+                 d_contents,
+                 constants_3,
+                 stash_constants,
+                 stash_count,
+                 d_values,
+                 d_retrieval_probes);
+        } else if (num_hash_functions == 4) {
+            hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>
+                (n_queries,
+                 d_keys,
+                 table_size,
+                 d_contents,
+                 constants_4,
+                 stash_constants,
+                 stash_count,
+                 d_values,
+                 d_retrieval_probes);
+        } else {
+            hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>
+                (n_queries,
+                 d_keys,
+                 table_size,
+                 d_contents,
+                 constants_5,
+                 stash_constants,
+                 stash_count,
+                 d_values,
+                 d_retrieval_probes);
+        }
+      
+        CUDA_CHECK_ERROR("Retrieval failed.\n");
+    
+    #ifdef TRACK_ITERATIONS
+        OutputRetrievalStatistics(n_queries,
+                                  d_retrieval_probes,
+                                  num_hash_functions);
+        CUDA_SAFE_CALL(cudaFree(d_retrieval_probes));
+    #endif
+    }
+};  // namespace CUDAWrapper
+
+
+};  // namespace CuckooHashing
--- a/src/hash/main.cc
+++ b/src/hash/main.cc
+#include <hash/hash_table.h>
+#include <cuda.h>
+
+int main(){
+    auto table = cudahash::HashTable();
+    table.Initialize(10, 2.0);
+    const int N = 10;
+
+    // ハッシュテーブルに格納するデータ
+    int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8};
+    int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+
+    // デバイスメモリにコピー
+    int *d_keys, *d_vals;
+    cudaMalloc((void**)&d_keys, sizeof(int) * N);
+    cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice);
+    cudaMalloc((void**)&d_vals, sizeof(int) * N);
+    cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice);
+
+    // ハッシュテーブルにクエリするデータ
+    int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    int output[N];
+
+    // デバイスメモリにコピー
+    int *d_input, *d_output;
+    cudaMalloc((void**)&d_input, sizeof(int) * N);
+    cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice);
+    cudaMalloc((void**)&d_output, sizeof(int) * N);
+    cudaMemset(d_output, 0, sizeof(int) * N);
+    bool s = table.Build(N, (const unsigned int *) d_keys, 
+                            (const unsigned int *) d_vals);
+
+    std::cout << s << std::endl;
+    table.Retrieve(N, (const unsigned int *) d_input,
+                            (unsigned int *) d_output);
+
+    std::cout << s << std::endl;
+    cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    for (int i = 0; i < N; ++i) {
+        printf("%d\n", output[i]);
+    }
+
+    return 0;
+}
\ No newline at end of file
--- a/src/hash/mt19937ar.cpp
+++ b/src/hash/mt19937ar.cpp
+/*
+   A C-program for MT19937, with initialization improved 2002/1/26.
+   Coded by Takuji Nishimura and Makoto Matsumoto.
+
+   Before using, initialize the state by using init_genrand(seed)
+   or init_by_array(init_key, key_length).
+
+   Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+     1. Redistributions of source code must retain the above copyright
+        notice, this list of conditions and the following disclaimer.
+
+     2. Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+
+     3. The names of its contributors may not be used to endorse or promote
+        products derived from this software without specific prior written
+        permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+   Any feedback is very welcome.
+   http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+   email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+*/
+
+#include <stdio.h>
+
+/* Period parameters */
+#define N 624
+#define M 397
+#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
+#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
+#define LOWER_MASK 0x7fffffffUL /* least significant r bits */
+
+static unsigned long mt[N]; /* the array for the state vector  */
+static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */
+
+/* initializes mt[N] with a seed */
+void init_genrand(unsigned long s)
+{
+    mt[0]= s & 0xffffffffUL;
+    for (mti=1; mti<N; mti++) {
+        mt[mti] =
+            (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);
+        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
+        /* In the previous versions, MSBs of the seed affect   */
+        /* only MSBs of the array mt[].                        */
+        /* 2002/01/09 modified by Makoto Matsumoto             */
+        mt[mti] &= 0xffffffffUL;
+        /* for >32 bit machines */
+    }
+}
+
+/* initialize by an array with array-length */
+/* init_key is the array for initializing keys */
+/* key_length is its length */
+/* slight change for C++, 2004/2/26 */
+void init_by_array(unsigned long init_key[], int key_length)
+{
+    int i, j, k;
+    init_genrand(19650218UL);
+    i=1; j=0;
+    k = (N>key_length ? N : key_length);
+    for (; k; k--) {
+        mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
+          + init_key[j] + j; /* non linear */
+        mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
+        i++; j++;
+        if (i>=N) { mt[0] = mt[N-1]; i=1; }
+        if (j>=key_length) j=0;
+    }
+    for (k=N-1; k; k--) {
+        mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
+          - i; /* non linear */
+        mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
+        i++;
+        if (i>=N) { mt[0] = mt[N-1]; i=1; }
+    }
+
+    mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
+}
+
+/* generates a random number on [0,0xffffffff]-interval */
+unsigned long genrand_int32(void)
+{
+    unsigned long y;
+    static unsigned long mag01[2]={0x0UL, MATRIX_A};
+    /* mag01[x] = x * MATRIX_A  for x=0,1 */
+
+    if (mti >= N) { /* generate N words at one time */
+        int kk;
+
+        if (mti == N+1)   /* if init_genrand() has not been called, */
+            init_genrand(5489UL); /* a default initial seed is used */
+
+        for (kk=0;kk<N-M;kk++) {
+            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
+            mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
+        }
+        for (;kk<N-1;kk++) {
+            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
+            mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
+        }
+        y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
+        mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];
+
+        mti = 0;
+    }
+
+    y = mt[mti++];
+
+    /* Tempering */
+    y ^= (y >> 11);
+    y ^= (y << 7) & 0x9d2c5680UL;
+    y ^= (y << 15) & 0xefc60000UL;
+    y ^= (y >> 18);
+
+    return y;
+}
+
+/* generates a random number on [0,0x7fffffff]-interval */
+long genrand_int31(void)
+{
+    return (long)(genrand_int32()>>1);
+}
+
+/* generates a random number on [0,1]-real-interval */
+double genrand_real1(void)
+{
+    return genrand_int32()*(1.0/4294967295.0);
+    /* divided by 2^32-1 */
+}
+
+/* generates a random number on [0,1)-real-interval */
+double genrand_real2(void)
+{
+    return genrand_int32()*(1.0/4294967296.0);
+    /* divided by 2^32 */
+}
+
+/* generates a random number on (0,1)-real-interval */
+double genrand_real3(void)
+{
+    return (((double)genrand_int32()) + 0.5)*(1.0/4294967296.0);
+    /* divided by 2^32 */
+}
+
+/* generates a random number on [0,1) with 53-bit resolution*/
+double genrand_res53(void)
+{
+    unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6;
+    return(a*67108864.0+b)*(1.0/9007199254740992.0);
+}
+/* These real versions are due to Isaku Wada, 2002/01/09 added */
--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
 add_library(spconv SHARED all.cc indice.cc indice.cu 
-            reordering.cc reordering.cu maxpool.cc maxpool.cu nms.cc)
+            reordering.cc reordering.cu maxpool.cc maxpool.cu nms.cc
+            pillar_scatter.cu)

 target_include_directories(spconv PRIVATE ${ALL_INCLUDE} )
 set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
 set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
 set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-target_link_libraries(spconv PRIVATE ${ALL_LIBS})
+target_link_libraries(spconv PRIVATE ${ALL_LIBS} cudahash)
 install (TARGETS spconv DESTINATION lib)
--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
@@ -15,6 +15,7 @@
 #include <cuda_runtime_api.h>
 #include <spconv/pool_ops.h>
 #include <spconv/spconv_ops.h>
+#include <spconv/pillar_scatter_ops.h>
 #include <spconv/fused_spconv_ops.h>
 #include <spconv/nms_ops.h>

@@ -34,7 +35,9 @@ static auto registry =
        .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
        .op("spconv::indice_maxpool_backward_fp32",
            &spconv::indiceMaxPoolBackward<float>)
-        .op("spconv::indice_maxpool_half", &spconv::indiceMaxPool<at::Half>)
-        .op("spconv::indice_maxpool_backward_half",
-            &spconv::indiceMaxPoolBackward<at::Half>)
-        .op("spconv::nms", &spconv::nonMaxSuppression<float>);
\ No newline at end of file
+        // .op("spconv::indice_maxpool_half", &spconv::indiceMaxPool<at::Half>)
+        // .op("spconv::indice_maxpool_backward_half",
+        //     &spconv::indiceMaxPoolBackward<at::Half>)
+        .op("spconv::nms", &spconv::nonMaxSuppression<float>)
+        .op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>)
+        .op("spconv::pillar_scatter_half", &spconv::pointPillarScatter<at::Half>);
\ No newline at end of file
--- a/src/spconv/indice.cc
+++ b/src/spconv/indice.cc
@@ -32,7 +32,7 @@ struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
                     const tv::SimpleVector<Index, NDim> padding,
                     const tv::SimpleVector<Index, NDim> dilation,
                     const tv::SimpleVector<Index, NDim> outSpatialShape,
-                     bool transpose, bool resetGrid) {
+                     bool transpose, bool resetGrid, bool useHash) {
    if (transpose)
      return getIndicePairsDeConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut,
@@ -59,7 +59,7 @@ struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
                     const tv::SimpleVector<Index, NDim> padding,
                     const tv::SimpleVector<Index, NDim> dilation,
                     const tv::SimpleVector<Index, NDim> outSpatialShape,
-                     bool transpose, bool resetGrid) {
+                     bool transpose, bool resetGrid, bool useHash) {
    return getIndicePairsSubM<Index, IndexGrid, NDim>(
        indicesIn,
        gridsOut, indicePairs, indiceNum,

--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
@@ -22,6 +22,7 @@
 #include <tensorview/tensorview.h>
 #include <type_traits>
 #include <utility/timer.h>
+#include <hash/hash_table.h>

 namespace spconv {
 namespace functor {
@@ -71,28 +72,61 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid) {
+                   bool transpose, bool resetGrid, bool useHash) {
    Index batchSize = gridsOut.dim(0);
    auto kernelVolume = indicePairs.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0)
      return 0;
    Index numAct = indicePairUnique.dim(0) - 1;
-    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
-        <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
-           d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
-                         indicePairUnique, outSpatialShape, batchSize);
-    TV_CHECK_CUDA_ERR();
-    assignIndicePairsKernel<Index, IndexGrid, NDim>
-        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-           d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
-                         indicePairUnique, outSpatialShape);
-    TV_CHECK_CUDA_ERR();
-    if (resetGrid) {
+    if (useHash){
+      auto table = cudahash::HashTable();
+      table.Initialize(numAct, 2.0);
+      Index *d_values = nullptr;
+      cudaMalloc((void**)&d_values, sizeof(Index) * numAct);
+      TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
+      arangeKernel<Index><<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
+            d.getStream()>>>(d_values, numAct);
+      bool res = table.Build(numAct, reinterpret_cast<unsigned*>(indicePairUnique.data()), 
+                reinterpret_cast<unsigned*>(d_values));
+      TV_ASSERT_RT_ERR(res, "err");
+      assignIndiceOutKernel<Index, NDim>
+          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
+            d.getStream()>>>(indicesOut, numAct,
+                          indicePairUnique, outSpatialShape, batchSize);
+      TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
+      cudaFree(d_values);
+      auto tableSize = table.get_table_size();
+      auto tableData = table.data();
+      auto constants = table.get_constants_4();
+      auto stash_constants = table.get_stash_constants();
+      auto stash_count = table.get_stash_count();
+      assignIndicePairsHashKernel<Index, IndexGrid, NDim>
+          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
+            d.getStream()>>>(indicesOut, numActIn, indicePairs,
+                          indicePairUnique,
+                          tableSize, tableData, constants, stash_constants,
+                          stash_count);
+      TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
+    }else{
+      assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
+          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
+            d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
+                          indicePairUnique, outSpatialShape, batchSize);
+      TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
+      assignIndicePairsKernel<Index, IndexGrid, NDim>
+          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
+            d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
+                          indicePairUnique, outSpatialShape);
+      TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
+
+    }
+
+    if (resetGrid && (!useHash)) {
      resetGridKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
-      TV_CHECK_CUDA_ERR();
+      TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
    }
    return numAct;
  }
@@ -109,22 +143,50 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
-                   bool transpose, bool resetGrid) {
+                   bool transpose, bool resetGrid, bool useHash) {
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0)
      return 0;
    // auto timer = spconv::CudaContextTimer<>();
-    prepareSubMGridKernel<Index, IndexGrid, NDim>
-        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-           d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
-    TV_CHECK_CUDA_ERR();
-    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
-        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
-           d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
-                         kernelSize, stride, padding, dilation, outSpatialShape);
-    TV_CHECK_CUDA_ERR();
+    if (useHash){
+      auto table = cudahash::HashTable();
+      table.Initialize(numActIn, 2.0);
+      unsigned *d_keyvalues = nullptr;
+      cudaMalloc((void**)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
+      unsigned *d_values = d_keyvalues + numActIn;
+      prepareSubMHashKernel<Index, NDim>
+          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
+            d.getStream()>>>(indicesIn, d_keyvalues, d_values, outSpatialShape);
+      TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
+      bool res = table.Build(numActIn, reinterpret_cast<unsigned*>(d_keyvalues), 
+                reinterpret_cast<unsigned*>(d_values));
+      TV_ASSERT_RT_ERR(res, "err");
+      cudaFree(d_keyvalues);
+      auto tableSize = table.get_table_size();
+      auto tableData = table.data();
+      auto constants = table.get_constants_4();
+      auto stash_constants = table.get_stash_constants();
+      auto stash_count = table.get_stash_count();
+      getSubMIndicePairsHashKernel<Index, NDim, 4096>
+          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
+            d.getStream()>>>(indicesIn, indicePairs, indiceNum,
+                          kernelSize, stride, padding, dilation, outSpatialShape,
+                          tableSize, tableData, constants, stash_constants,
+                          stash_count);
+      TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
+    }else{
+      prepareSubMGridKernel<Index, IndexGrid, NDim>
+          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
+            d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
+      TV_CHECK_CUDA_ERR();
+      getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
+          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
+            d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
+                          kernelSize, stride, padding, dilation, outSpatialShape);
+      TV_CHECK_CUDA_ERR();
+    }
    // std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
-    if (resetGrid) {
+    if (resetGrid && (!useHash)) {
      resetGridSubMKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape, numActIn);

--- a/src/spconv/maxpool.cu
+++ b/src/spconv/maxpool.cu
@@ -464,7 +464,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {

 DECLARE_GPU_SPECS(float);
 DECLARE_GPU_SPECS(double);
-DECLARE_GPU_SPECS(at::Half);
+// DECLARE_GPU_SPECS(at::Half); // currently have problem

 #undef DECLARE_GPU_SPECS
 #undef DECLARE_GPU_SPECS_T_INDEX

--- a/src/spconv/pillar_scatter.cu
+++ b/src/spconv/pillar_scatter.cu
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <ATen/ATen.h>
+#include <chrono>
+#include <limits>
+#include <spconv/mp_helper.h>
+#include <spconv/pillar_scatter_functor.h>
+#include <tensorview/helper_kernel.cu.h>
+#include <tensorview/helper_launch.h>
+#include <tensorview/tensorview.h>
+#include <type_traits>
+#include <utility/timer.h>
+
+namespace spconv {
+template <typename T, typename Index>
+__global__ void pointPillarsScatterKernel(tv::TensorView<T> canvas,
+                                          tv::TensorView<const T> features,
+                                          tv::TensorView<const T> coors) {
+  auto numFeatures = features.dim(0);
+  auto numPoints = features.dim(1);
+  for (int i : tv::KernelLoopX<int>(numPoints)) {
+    for (int ifeature : tv::KernelLoopY<int>(numFeatures)) {
+      canvas(int(coors(0, i)), ifeature, int(coors(2, i)), int(coors(3, i))) =
+          features(ifeature, i);
+    }
+  }
+}
+namespace functor {
+template <typename T, typename Index>
+struct PointPillarScatter<tv::GPU, T, Index> {
+  void operator()(const tv::GPU &d, tv::TensorView<T> canvas,
+                  tv::TensorView<const T> features,
+                  tv::TensorView<const T> coors) {
+    auto grid = dim3(tv::launch::DivUp(features.dim(1), 32),
+                     tv::launch::DivUp(features.dim(0), 32));
+    pointPillarsScatterKernel<T, Index>
+        <<<grid, dim3(32, 32), 0, d.getStream()>>>(canvas, features, coors);
+    TV_CHECK_CUDA_ERR();
+  }
+};
+} // namespace functor
+
+#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                                    \
+  template struct functor::PointPillarScatter<tv::GPU, T, Index>;
+
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
+
+DECLARE_GPU_SPECS(float);
+DECLARE_GPU_SPECS(double);
+DECLARE_GPU_SPECS(at::Half);
+
+#undef DECLARE_GPU_SPECS
+#undef DECLARE_GPU_SPECS_T_INDEX
+} // namespace spconv
\ No newline at end of file
--- a/src/utils/all.cc
+++ b/src/utils/all.cc
 // Copyright 2019 Yan Yan
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <spconv/box_iou.h>
 #include <spconv/nms.h>
 #include <spconv/point2voxel.h>
-#include <spconv/box_iou.h>
 namespace py = pybind11;
 using namespace pybind11::literals;

-PYBIND11_MODULE(spconv_utils, m)
-{
-    m.doc() = "util pybind11 functions for spconv";
-    m.def("non_max_suppression", &spconv::non_max_suppression<double>, py::return_value_policy::reference_internal, "bbox iou",
-          "boxes"_a = 1, "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
-    m.def("non_max_suppression", &spconv::non_max_suppression<float>, py::return_value_policy::reference_internal, "bbox iou",
-          "boxes"_a = 1, "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
-    m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>, py::return_value_policy::reference_internal, "bbox iou",
-          "boxes"_a = 1, "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
-    m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<float>, py::return_value_policy::reference_internal, "bbox iou",
-          "boxes"_a = 1, "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
-    m.def("rotate_non_max_suppression_cpu", &spconv::rotate_non_max_suppression_cpu<float>, py::return_value_policy::reference_internal, "bbox iou",
-          "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3, "thresh"_a = 4);
-    m.def("rotate_non_max_suppression_cpu", &spconv::rotate_non_max_suppression_cpu<double>, py::return_value_policy::reference_internal, "bbox iou",
-          "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3, "thresh"_a = 4);
-    m.def("rbbox_iou", &spconv::rbbox_iou<double>,
-          py::return_value_policy::reference_internal, "rbbox iou",
-          "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-          "standup_thresh"_a = 4);
-    m.def("rbbox_iou", &spconv::rbbox_iou<float>,
-          py::return_value_policy::reference_internal, "rbbox iou",
-          "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-          "standup_thresh"_a = 4);
-    m.def("rbbox_intersection", &spconv::rbbox_intersection<double>,
-          py::return_value_policy::reference_internal, "rbbox iou",
-          "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-          "standup_thresh"_a = 4);
-    m.def("rbbox_intersection", &spconv::rbbox_intersection<float>,
-          py::return_value_policy::reference_internal, "rbbox iou",
-          "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-          "standup_thresh"_a = 4);
-    m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<float, 3>,
-          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "coors"_a = 3,
-          "num_points_per_voxel"_a = 4, "coor_to_voxelidx"_a = 5,
-          "voxel_size"_a = 6, "coors_range"_a = 7, "max_points"_a = 8,
-          "max_voxels"_a = 9);
-    m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<double, 3>,
-          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "coors"_a = 3,
-          "num_points_per_voxel"_a = 4, "coor_to_voxelidx"_a = 5,
-          "voxel_size"_a = 6, "coors_range"_a = 7, "max_points"_a = 8,
-          "max_voxels"_a = 9);
-    m.def("points_to_voxel_3d_np_mean", &spconv::points_to_voxel_3d_np_mean<float, 3>,
-          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "means"_a = 3, "coors"_a = 4,
-          "num_points_per_voxel"_a = 5, "coor_to_voxelidx"_a = 6,
-          "voxel_size"_a = 7, "coors_range"_a = 8, "max_points"_a = 9,
-          "max_voxels"_a = 10);
-    m.def("points_to_voxel_3d_np_mean", &spconv::points_to_voxel_3d_np_mean<double, 3>,
-          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "means"_a = 3, "coors"_a = 4,
-          "num_points_per_voxel"_a = 5, "coor_to_voxelidx"_a = 6,
-          "voxel_size"_a = 7, "coors_range"_a = 8, "max_points"_a = 9,
-          "max_voxels"_a = 10);
-    m.def("points_to_voxel_3d_np_height", &spconv::points_to_voxel_3d_np_height<double, 3>,
-          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "heights"_a = 3,
-          "maxs"_a = 4, "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
-          "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
-          "max_voxels"_a = 11);
-    m.def("points_to_voxel_3d_with_filtering", &spconv::points_to_voxel_3d_with_filtering<float, 3>,
-          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "voxel_mask"_a = 3, "mins"_a = 4,
-          "maxs"_a = 5, "coors"_a = 6, "num_points_per_voxel"_a = 7, "coor_to_voxelidx"_a = 8,
-          "voxel_size"_a = 9, "coors_range"_a = 10, "max_points"_a = 11,
-          "max_voxels"_a = 12, "block_factor"_a = 13, "block_size"_a = 14, "height_threshold"_a = 15);
-    m.def("points_to_voxel_3d_with_filtering", &spconv::points_to_voxel_3d_with_filtering<double, 3>,
-          "matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "voxel_mask"_a = 3, "mins"_a = 4,
-          "maxs"_a = 5, "coors"_a = 6, "num_points_per_voxel"_a = 7, "coor_to_voxelidx"_a = 8,
-          "voxel_size"_a = 9, "coors_range"_a = 10, "max_points"_a = 11,
-          "max_voxels"_a = 12, "block_factor"_a = 13, "block_size"_a = 14, "height_threshold"_a = 15);
+PYBIND11_MODULE(spconv_utils, m) {
+  m.doc() = "util pybind11 functions for spconv";
+  m.def("non_max_suppression", &spconv::non_max_suppression<double>,
+        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
+        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
+  m.def("non_max_suppression", &spconv::non_max_suppression<float>,
+        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
+        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
+  m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>,
+        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
+        "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
+  m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<float>,
+        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
+        "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
+  m.def("rotate_non_max_suppression_cpu",
+        &spconv::rotate_non_max_suppression_cpu<float>,
+        py::return_value_policy::reference_internal, "bbox iou",
+        "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
+        "thresh"_a = 4);
+  m.def("rotate_non_max_suppression_cpu",
+        &spconv::rotate_non_max_suppression_cpu<double>,
+        py::return_value_policy::reference_internal, "bbox iou",
+        "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
+        "thresh"_a = 4);
+  m.def("rbbox_iou", &spconv::rbbox_iou<double>,
+        py::return_value_policy::reference_internal, "rbbox iou",
+        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
+        "standup_thresh"_a = 4);
+  m.def("rbbox_iou", &spconv::rbbox_iou<float>,
+        py::return_value_policy::reference_internal, "rbbox iou",
+        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
+        "standup_thresh"_a = 4);
+  m.def("rbbox_intersection", &spconv::rbbox_intersection<double>,
+        py::return_value_policy::reference_internal, "rbbox iou",
+        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
+        "standup_thresh"_a = 4);
+  m.def("rbbox_intersection", &spconv::rbbox_intersection<float>,
+        py::return_value_policy::reference_internal, "rbbox iou",
+        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
+        "standup_thresh"_a = 4);
+  m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<float, 3>,
+        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
+        "voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
+        "coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
+        "max_points"_a = 9, "max_voxels"_a = 10);
+  m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<double, 3>,
+        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
+        "voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
+        "coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
+        "max_points"_a = 9, "max_voxels"_a = 10);
+  m.def("points_to_voxel_3d_np_mean",
+        &spconv::points_to_voxel_3d_np_mean<float, 3>, "matrix tensor_square",
+        "points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
+        "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
+        "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
+        "max_voxels"_a = 11);
+  m.def("points_to_voxel_3d_np_mean",
+        &spconv::points_to_voxel_3d_np_mean<double, 3>, "matrix tensor_square",
+        "points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
+        "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
+        "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
+        "max_voxels"_a = 11);
+  m.def("points_to_voxel_3d_with_filtering",
+        &spconv::points_to_voxel_3d_with_filtering<float, 3>,
+        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
+        "voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
+        "maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
+        "coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
+        "max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
+        "block_size"_a = 15, "height_threshold"_a = 16,
+        "height_high_threshold"_a = 17);
+  m.def("points_to_voxel_3d_with_filtering",
+        &spconv::points_to_voxel_3d_with_filtering<float, 3>,
+        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
+        "voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
+        "maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
+        "coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
+        "max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
+        "block_size"_a = 15, "height_threshold"_a = 16,
+        "height_high_threshold"_a = 17);
 }
\ No newline at end of file