Merge branch 'develop'

a6abf55d · yan.yan · fad30002 · 79a3eaf2 · a6abf55d · a6abf55d
Commit a6abf55d authored Oct 20, 2021 by yan.yan
20 changed files
--- a/spconv/tables.py
+++ b/spconv/tables.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 from torch.autograd import Function

-import spconv
+import spconv.pytorch as spconv
 #from torch.nn import Module
-from spconv.modules import SparseModule
+from spconv.pytorch.modules import SparseModule


 class JoinTable(SparseModule):  # Module):

--- a/spconv/test_utils.py
+++ b/spconv/test_utils.py
-# Copyright 2019 Yan Yan
-#
+# Copyright 2021 Yan Yan
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

--- a/spconv/utils/__init__.py
+++ b/spconv/utils/__init__.py
-# Copyright 2019 Yan Yan
-#
+# Copyright 2021 Yan Yan
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,282 +13,13 @@
 # limitations under the License.

 import numpy as np
-
-from spconv import spconv_utils
-from spconv.spconv_utils import (non_max_suppression_cpu,
-                                 points_to_voxel_3d_np,
-                                 points_to_voxel_3d_np_mean,
-                                 points_to_voxel_3d_with_filtering,
-                                 rbbox_intersection, rbbox_iou,
-                                 rotate_non_max_suppression_cpu)
-
-try:
-    from spconv.spconv_utils import non_max_suppression
-except ImportError:
-    pass
-
-
-def points_to_voxel(points,
-                    voxel_size,
-                    coors_range,
-                    coor_to_voxelidx,
-                    max_points=35,
-                    max_voxels=20000,
-                    full_mean=False,
-                    block_filtering=True,
-                    block_factor=1,
-                    block_size=8,
-                    height_threshold=0.2,
-                    height_high_threshold=3.0,
-                    pad_output=False):
-    """convert 3d points(N, >=3) to voxels. This version calculate
-    everything in one loop. now it takes only 0.8ms(~6k voxels) 
-    with c++ and 3.2ghz cpu.
-
-    Args:
-        points: [N, ndim] float tensor. points[:, :3] contain xyz points and
-            points[:, 3:] contain other information such as reflectivity.
-        voxel_size: [3] list/tuple or array, float. xyz, indicate voxel size
-        coors_range: [6] list/tuple or array, float. indicate voxel range.
-            format: xyzxyz, minmax
-        coor_to_voxelidx: int array. used as a dense map.
-        max_points: int. indicate maximum points contained in a voxel.
-        max_voxels: int. indicate maximum voxels this function create.
-            for voxelnet, 20000 is a good choice. you should shuffle points
-            before call this function because max_voxels may drop some points.
-        full_mean: bool. if true, all empty points in voxel will be filled with mean
-            of exist points.
-        block_filtering: filter voxels by height. used for lidar point cloud.
-            use some visualization tool to see filtered result.
-    Returns:
-        voxels: [M, max_points, ndim] float tensor. only contain points.
-        coordinates: [M, 3] int32 tensor. zyx format.
-        num_points_per_voxel: [M] int32 tensor.
-    """
-    if full_mean:
-        assert block_filtering is False
-    if not isinstance(voxel_size, np.ndarray):
-        voxel_size = np.array(voxel_size, dtype=points.dtype)
-    if not isinstance(coors_range, np.ndarray):
-        coors_range = np.array(coors_range, dtype=points.dtype)
-    voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size
-    voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist())
-    voxelmap_shape = voxelmap_shape[::-1]
-    num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
-    voxels = np.zeros(shape=(max_voxels, max_points, points.shape[-1]),
-                      dtype=points.dtype)
-    voxel_point_mask = np.zeros(shape=(max_voxels, max_points),
-                                dtype=points.dtype)
-    coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
-    res = {
-        "voxels": voxels,
-        "coordinates": coors,
-        "num_points_per_voxel": num_points_per_voxel,
-        "voxel_point_mask": voxel_point_mask,
-    }
-    if full_mean:
-        means = np.zeros(shape=(max_voxels, points.shape[-1]),
-                         dtype=points.dtype)
-        voxel_num = points_to_voxel_3d_np_mean(points, voxels,
-                                               voxel_point_mask, means, coors,
-                                               num_points_per_voxel,
-                                               coor_to_voxelidx,
-                                               voxel_size.tolist(),
-                                               coors_range.tolist(),
-                                               max_points, max_voxels)
-    else:
-        if block_filtering:
-            block_shape = [*voxelmap_shape[1:]]
-            block_shape = [b // block_factor for b in block_shape]
-            mins = np.full(block_shape, 99999999, dtype=points.dtype)
-            maxs = np.full(block_shape, -99999999, dtype=points.dtype)
-            voxel_mask = np.zeros((max_voxels, ), dtype=np.int32)
-            voxel_num = points_to_voxel_3d_with_filtering(
-                points, voxels, voxel_point_mask, voxel_mask, mins, maxs,
-                coors, num_points_per_voxel, coor_to_voxelidx,
-                voxel_size.tolist(), coors_range.tolist(), max_points,
-                max_voxels, block_factor, block_size, height_threshold,
-                height_high_threshold)
-            voxel_mask = voxel_mask.astype(np.bool_)
-            coors_ = coors[voxel_mask]
-            if pad_output:
-                res["coordinates"][:voxel_num] = coors_
-                res["voxels"][:voxel_num] = voxels[voxel_mask]
-                res["voxel_point_mask"][:voxel_num] = voxel_point_mask[
-                    voxel_mask]
-
-                res["num_points_per_voxel"][:voxel_num] = num_points_per_voxel[
-                    voxel_mask]
-                res["coordinates"][voxel_num:] = 0
-                res["voxels"][voxel_num:] = 0
-                res["num_points_per_voxel"][voxel_num:] = 0
-                res["voxel_point_mask"][voxel_num:] = 0
-            else:
-                res["coordinates"] = coors_
-                res["voxels"] = voxels[voxel_mask]
-                res["num_points_per_voxel"] = num_points_per_voxel[voxel_mask]
-                res["voxel_point_mask"] = voxel_point_mask[voxel_mask]
-            voxel_num = coors_.shape[0]
-        else:
-            voxel_num = points_to_voxel_3d_np(points, voxels, voxel_point_mask,
-                                              coors, num_points_per_voxel,
-                                              coor_to_voxelidx,
-                                              voxel_size.tolist(),
-                                              coors_range.tolist(), max_points,
-                                              max_voxels)
-    res["voxel_num"] = voxel_num
-    res["voxel_point_mask"] = res["voxel_point_mask"].reshape(
-        -1, max_points, 1)
-    return res
-
-
-class VoxelGenerator:
-    def __init__(self,
-                 voxel_size,
-                 point_cloud_range,
-                 max_num_points,
-                 max_voxels=20000,
-                 full_mean=True):
-        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
-        # [0, -40, -3, 70.4, 40, 1]
-        voxel_size = np.array(voxel_size, dtype=np.float32)
-        grid_size = (point_cloud_range[3:] -
-                     point_cloud_range[:3]) / voxel_size
-        grid_size = np.round(grid_size).astype(np.int64)
-        voxelmap_shape = tuple(np.round(grid_size).astype(np.int32).tolist())
-        voxelmap_shape = voxelmap_shape[::-1]
-
-        self._coor_to_voxelidx = np.full(voxelmap_shape, -1, dtype=np.int32)
-        self._voxel_size = voxel_size
-        self._point_cloud_range = point_cloud_range
-        self._max_num_points = max_num_points
-        self._max_voxels = max_voxels
-        self._grid_size = grid_size
-        self._full_mean = full_mean
-
-    def generate(self, points, max_voxels=None):
-        res = points_to_voxel(points, self._voxel_size,
-                              self._point_cloud_range, self._coor_to_voxelidx,
-                              self._max_num_points, max_voxels
-                              or self._max_voxels, self._full_mean)
-        voxels = res["voxels"]
-        coors = res["coordinates"]
-        num_points_per_voxel = res["num_points_per_voxel"]
-        voxel_num = res["voxel_num"]
-        coors = coors[:voxel_num]
-        voxels = voxels[:voxel_num]
-        num_points_per_voxel = num_points_per_voxel[:voxel_num]
-
-        return (voxels, coors, num_points_per_voxel)
-
-    def generate_multi_gpu(self, points, max_voxels=None):
-        res = points_to_voxel(points, self._voxel_size,
-                              self._point_cloud_range, self._coor_to_voxelidx,
-                              self._max_num_points, max_voxels
-                              or self._max_voxels, self._full_mean)
-        voxels = res["voxels"]
-        coors = res["coordinates"]
-        num_points_per_voxel = res["num_points_per_voxel"]
-        voxel_num = res["voxel_num"]
-        return (voxels, coors, num_points_per_voxel)
-
-    @property
-    def voxel_size(self):
-        return self._voxel_size
-
-    @property
-    def max_num_points_per_voxel(self):
-        return self._max_num_points
-
-    @property
-    def point_cloud_range(self):
-        return self._point_cloud_range
-
-    @property
-    def grid_size(self):
-        return self._grid_size
-
-
-class VoxelGeneratorV2:
-    def __init__(self,
-                 voxel_size,
-                 point_cloud_range,
-                 max_num_points,
-                 max_voxels=20000,
-                 full_mean=False,
-                 block_filtering=False,
-                 block_factor=8,
-                 block_size=3,
-                 height_threshold=0.1,
-                 height_high_threshold=2.0):
-        assert full_mean is False, "don't use this."
-        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
-        # [0, -40, -3, 70.4, 40, 1]
-        voxel_size = np.array(voxel_size, dtype=np.float32)
-        grid_size = (point_cloud_range[3:] -
-                     point_cloud_range[:3]) / voxel_size
-        grid_size = np.round(grid_size).astype(np.int64)
-        if block_filtering:
-            assert block_size > 0
-            assert grid_size[0] % block_factor == 0
-            assert grid_size[1] % block_factor == 0
-
-        voxelmap_shape = tuple(np.round(grid_size).astype(np.int32).tolist())
-        voxelmap_shape = voxelmap_shape[::-1]
-        self._coor_to_voxelidx = np.full(voxelmap_shape, -1, dtype=np.int32)
-        self._voxel_size = voxel_size
-        self._point_cloud_range = point_cloud_range
-        self._max_num_points = max_num_points
-        self._max_voxels = max_voxels
-        self._grid_size = grid_size
-        self._full_mean = full_mean
-        self._block_filtering = block_filtering
-        self._block_factor = block_factor
-        self._height_threshold = height_threshold
-        self._block_size = block_size
-        self._height_high_threshold = height_high_threshold
-
-    def generate(self, points, max_voxels=None):
-        res = points_to_voxel(points, self._voxel_size,
-                              self._point_cloud_range, self._coor_to_voxelidx,
-                              self._max_num_points, max_voxels
-                              or self._max_voxels, self._full_mean,
-                              self._block_filtering, self._block_factor,
-                              self._block_size, self._height_threshold,
-                              self._height_high_threshold)
-        for k, v in res.items():
-            if k != "voxel_num":
-                res[k] = v[:res["voxel_num"]]
-        return res
-
-    def generate_multi_gpu(self, points, max_voxels=None):
-        res = points_to_voxel(points,
-                              self._voxel_size,
-                              self._point_cloud_range,
-                              self._coor_to_voxelidx,
-                              self._max_num_points,
-                              max_voxels or self._max_voxels,
-                              self._full_mean,
-                              self._block_filtering,
-                              self._block_factor,
-                              self._block_size,
-                              self._height_threshold,
-                              self._height_high_threshold,
-                              pad_output=True)
-        return res
-
-    @property
-    def voxel_size(self):
-        return self._voxel_size
-
-    @property
-    def max_num_points_per_voxel(self):
-        return self._max_num_points
-
-    @property
-    def point_cloud_range(self):
-        return self._point_cloud_range
-
-    @property
-    def grid_size(self):
-        return self._grid_size
+from cumm import tensorview as tv 
+from spconv.core_cc.csrc.sparse.all.ops1d import Point2Voxel as Point2VoxelGPU1d
+from spconv.core_cc.csrc.sparse.all.ops2d import Point2Voxel as Point2VoxelGPU2d
+from spconv.core_cc.csrc.sparse.all.ops3d import Point2Voxel as Point2VoxelGPU3d
+from spconv.core_cc.csrc.sparse.all.ops4d import Point2Voxel as Point2VoxelGPU4d
+
+from spconv.core_cc.csrc.sparse.all.ops_cpu1d import Point2VoxelCPU as Point2VoxelCPU1d
+from spconv.core_cc.csrc.sparse.all.ops_cpu2d import Point2VoxelCPU as Point2VoxelCPU2d
+from spconv.core_cc.csrc.sparse.all.ops_cpu3d import Point2VoxelCPU as Point2VoxelCPU3d
+from spconv.core_cc.csrc.sparse.all.ops_cpu4d import Point2VoxelCPU as Point2VoxelCPU4d
\ No newline at end of file
--- a/src/cuhash/CMakeLists.txt
+++ b/src/cuhash/CMakeLists.txt
-if(WIN32)
-    add_library(cuhash SHARED hash_functions.cu hash_table.cpp hash_table.cu hash_functions.cpp)
-else()
-    add_library(cuhash SHARED hash_functions.cu hash_table.cpp hash_table.cu hash_functions.cpp)
-endif()
-target_include_directories(cuhash PRIVATE ${ALL_INCLUDE} )
-set_property(TARGET cuhash PROPERTY CUDA_STANDARD 14)
-set_property(TARGET cuhash PROPERTY CXX_STANDARD 14)
-set_target_properties(cuhash PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-set_target_properties(cuhash PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-if(NOT WIN32)
-    set_property(TARGET cuhash PROPERTY POSITION_INDEPENDENT_CODE ON)
-endif()
-target_link_libraries(cuhash PRIVATE ${ALL_LIBS})
-install (TARGETS cuhash DESTINATION lib)
-
-if (SPCONV_BuildTests)
-    add_executable(cuhash_test main.cc)
-    target_include_directories(cuhash_test PRIVATE ${ALL_INCLUDE} )
-    set_property(TARGET cuhash_test PROPERTY CUDA_STANDARD 14)
-    set_property(TARGET cuhash_test PROPERTY CXX_STANDARD 14)
-    set_target_properties(cuhash_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-    target_link_libraries(cuhash_test PRIVATE ${ALL_LIBS} cuhash)
-    install (TARGETS cuhash_test DESTINATION bin)
-endif()
\ No newline at end of file
--- a/src/cuhash/debugging.cpp
+++ b/src/cuhash/debugging.cpp
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file
- * debugging.cpp
- *
- * @brief Debugging/statistics/performance utilities for hash tables.
- */
-
-#include <cuhash/debugging.h>
-#include <cuhash/definitions.h>
-
-#include <algorithm>
-#include <cstring>
-#include <cuhash/cuda_util.h>
-
-namespace cuhash {
-
-void OutputRetrievalStatistics(const unsigned n_queries,
-                               const unsigned *d_retrieval_probes,
-                               const unsigned n_functions) {
-  unsigned *retrieval_probes = new unsigned[n_queries];
-  CUDA_SAFE_CALL(cudaMemcpy(retrieval_probes, d_retrieval_probes,
-                            sizeof(unsigned) * n_queries,
-                            cudaMemcpyDeviceToHost));
-
-  // Create a histogram showing how many items needed how many probes to be
-  // found.
-  unsigned possible_probes = n_functions + 2;
-  unsigned *histogram = new unsigned[possible_probes];
-  memset(histogram, 0, sizeof(unsigned) * (possible_probes));
-  for (unsigned i = 0; i < n_queries; ++i) {
-    histogram[retrieval_probes[i]]++;
-  }
-
-  // Dump it.
-  char buffer[10000];
-  sprintf(buffer, "Probes for retrieval: ");
-  PrintMessage(buffer);
-  for (unsigned i = 0; i < possible_probes; ++i) {
-    sprintf(buffer, "\t(%u, %u)", i, histogram[i]);
-    PrintMessage(buffer);
-  }
-  delete[] retrieval_probes;
-  delete[] histogram;
-}
-
-void OutputBuildStatistics(const unsigned n,
-                           const unsigned *d_iterations_taken) {
-  // Output how many iterations each thread took until it found an empty slot.
-  unsigned *iterations_taken = new unsigned[n];
-  CUDA_SAFE_CALL(cudaMemcpy(iterations_taken, d_iterations_taken,
-                            sizeof(unsigned) * n, cudaMemcpyDeviceToHost));
-  std::sort(iterations_taken, iterations_taken + n);
-  unsigned total_iterations = 0;
-  unsigned max_iterations_taken = 0;
-  for (unsigned i = 0; i < n; ++i) {
-    total_iterations += iterations_taken[i];
-    max_iterations_taken = std::max(max_iterations_taken, iterations_taken[i]);
-  }
-
-  unsigned current_value = iterations_taken[0];
-  unsigned count = 1;
-  char buffer[10000];
-  sprintf(buffer, "Iterations taken:\n");
-  for (unsigned i = 1; i < n; ++i) {
-    if (iterations_taken[i] != current_value) {
-      sprintf(buffer, "%s\t(%u, %u)\n", buffer, current_value, count);
-      current_value = iterations_taken[i];
-      count = 1;
-    } else {
-      count++;
-    }
-  }
-  sprintf(buffer, "%s\t(%u, %u)", buffer, current_value, count);
-  PrintMessage(buffer);
-  sprintf(buffer, "Total iterations: %u", total_iterations);
-  PrintMessage(buffer);
-  sprintf(buffer, "Avg/Med/Max iterations: (%f %u %u)",
-          (float)total_iterations / n, iterations_taken[n / 2],
-          iterations_taken[n - 1]);
-  PrintMessage(buffer);
-  delete[] iterations_taken;
-
-  // Print the length of the longest eviction chain.
-  sprintf(buffer, "Max iterations: %u", max_iterations_taken);
-  PrintMessage(buffer);
-}
-
-}; // namespace cuhash
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
--- a/src/cuhash/debugging.cu
+++ b/src/cuhash/debugging.cu
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file
- * debugging.cu
- *
- * @brief Debugging/statistics/performance utilities for hash tables.
- */
-
-#include <cuhash/debugging.h>
-#include <cuhash/definitions.h>
-#include <cuhash/hash_table.cuh>
-
-#include <algorithm>
-#include <cuhash/cuda_util.h>
-
-namespace cuhash {
-
-//! Debugging function: Takes statistics on the hash functions' distribution.
-/*! Determines:
- *    - How many unique slots each key has.
- *    - How many keys hash into each slot.
- *    - Whether any keys failed to get a full set of slots.
- */
-__global__ void take_hash_function_statistics_kernel(
-    const unsigned *keys, const unsigned n_entries, const unsigned table_size,
-    const uint2 *constants, const unsigned num_functions,
-    unsigned *num_slots_available, unsigned *num_hashing_in, unsigned *failed) {
-  unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x +
-                          blockIdx.y * blockDim.x * gridDim.x;
-
-  if (thread_index >= n_entries)
-    return;
-  unsigned key = keys[thread_index];
-
-  // Determine all of the locations the key hashes into.
-  // Also count how many keys hash into each location.
-  unsigned locations[kMaxHashFunctions];
-  for (unsigned i = 0; i < num_functions; ++i) {
-    locations[i] = hash_function_inner(constants[i], key) % table_size;
-
-    if (num_hashing_in != NULL) {
-      atomicAdd(num_hashing_in + locations[i], 1);
-    }
-  }
-
-  // Determine whether all of the locations were different.
-  unsigned num_slots = 1;
-  for (unsigned i = 1; i < num_functions; ++i) {
-    bool matched = false;
-    for (unsigned j = 0; j < i; ++j) {
-      if (locations[i] == locations[j]) {
-        matched = true;
-        break;
-      }
-    }
-    if (!matched) {
-      num_slots++;
-    }
-  }
-
-  if (num_slots_available != NULL) {
-    num_slots_available[thread_index] = num_slots;
-  }
-
-  if (failed != NULL && num_slots != num_functions) {
-    *failed = 1;
-  }
-}
-
-void TakeHashFunctionStatistics(const unsigned num_keys, const unsigned *d_keys,
-                                const unsigned table_size,
-                                const uint2 *constants,
-                                const unsigned kNumHashFunctions) {
-  char buffer[16000];
-  PrintMessage("Hash function constants: ");
-
-  for (unsigned i = 0; i < kNumHashFunctions; ++i) {
-    sprintf(buffer, "\t%10u, %10u", constants[i].x, constants[i].y);
-    PrintMessage(buffer);
-  }
-
-  unsigned *d_num_hashing_in = NULL;
-#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_num_hashing_in, sizeof(unsigned) * table_size));
-  CUDA_SAFE_CALL(
-      cudaMemset(d_num_hashing_in, 0, sizeof(unsigned) * table_size));
-#endif
-
-  unsigned *d_num_slots_available = NULL;
-#ifdef COUNT_HOW_MANY_HAVE_CYCLES
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_num_slots_available, sizeof(unsigned) * num_keys));
-#endif
-  uint2 *d_constants = NULL;
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_constants, sizeof(uint2) * kNumHashFunctions));
-  CUDA_SAFE_CALL(cudaMemcpy(d_constants, constants,
-                            sizeof(uint2) * kNumHashFunctions,
-                            cudaMemcpyHostToDevice));
-
-  take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys),
-                                         kBlockSize>>>(
-      d_keys, num_keys, table_size, d_constants, kNumHashFunctions,
-      d_num_slots_available, d_num_hashing_in, NULL);
-  CUDA_SAFE_CALL(cudaFree(d_constants));
-
-#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
-  unsigned *num_hashing_in = new unsigned[table_size];
-  CUDA_SAFE_CALL(cudaMemcpy(num_hashing_in, d_num_hashing_in,
-                            sizeof(unsigned) * table_size,
-                            cudaMemcpyDeviceToHost));
-
-  /*
-  // Print how many items hash into each slot.
-  // Used to make sure items are spread evenly throughout the table.
-  buffer[0] = '\0';
-  PrintMessage("Num hashing into each: ", true);
-  for (unsigned i = 0; i < table_size; ++i) {
-    sprintf(buffer, "%s\t%2u", buffer, num_hashing_in[i]);
-    if (i % 25 == 24) {
-      PrintMessage(buffer, true);
-      buffer[0] = '\0';
-    }
-  }
-  PrintMessage(buffer,true);
-  */
-
-  // Print a histogram of how many items are hashed into each slot.  Shows
-  // if average number of items hashing into each slot is low.
-  std::sort(num_hashing_in, num_hashing_in + table_size);
-  int count = 1;
-  unsigned previous = num_hashing_in[0];
-  sprintf(buffer, "Num items hashing into a slot:\t");
-  PrintMessage(buffer);
-  for (unsigned i = 1; i < table_size; ++i) {
-    if (num_hashing_in[i] != previous) {
-      sprintf(buffer, "\t(%u, %u)", previous, count);
-      PrintMessage(buffer);
-      previous = num_hashing_in[i];
-      count = 1;
-    } else {
-      count++;
-    }
-  }
-  sprintf(buffer, "\t(%u, %u)", previous, count);
-  PrintMessage(buffer);
-
-  delete[] num_hashing_in;
-  CUDA_SAFE_CALL(cudaFree(d_num_hashing_in));
-#endif
-
-#ifdef COUNT_HOW_MANY_HAVE_CYCLES
-  unsigned *num_slots_available = new unsigned[num_keys];
-  CUDA_SAFE_CALL(cudaMemcpy(num_slots_available, d_num_slots_available,
-                            sizeof(unsigned) * num_keys,
-                            cudaMemcpyDeviceToHost));
-
-  static const unsigned kHistogramSize = kNumHashFunctions + 1;
-  unsigned *histogram = new unsigned[kHistogramSize];
-  memset(histogram, 0, sizeof(unsigned) * kHistogramSize);
-  for (unsigned i = 0; i < num_keys; ++i) {
-    histogram[num_slots_available[i]]++;
-  }
-
-  sprintf(buffer, "Slots assigned to each key: ");
-  for (unsigned i = 1; i < kHistogramSize; ++i) {
-    sprintf(buffer, "%s(%u, %u) ", buffer, i, histogram[i]);
-  }
-  PrintMessage(buffer);
-
-  delete[] histogram;
-  delete[] num_slots_available;
-  CUDA_SAFE_CALL(cudaFree(d_num_slots_available));
-#endif
-}
-
-bool CheckAssignedSameSlot(const unsigned N, const unsigned num_keys,
-                           const unsigned *d_keys, const unsigned table_size,
-                           uint2 *constants) {
-  unsigned *d_cycle_exists = NULL;
-  uint2 *d_constants = NULL;
-
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cycle_exists, sizeof(unsigned)));
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_constants, sizeof(uint2) * N));
-
-  CUDA_SAFE_CALL(cudaMemset(d_cycle_exists, 0, sizeof(unsigned)));
-  CUDA_SAFE_CALL(cudaMemcpy(d_constants, constants, sizeof(uint2) * N,
-                            cudaMemcpyHostToDevice));
-
-  // Check if all keys were given a full set of N slots by the functions.
-  take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys),
-                                         kBlockSize>>>(
-      d_keys, num_keys, table_size, d_constants, N, NULL, NULL, d_cycle_exists);
-
-  unsigned cycle_exists;
-  CUDA_SAFE_CALL(cudaMemcpy(&cycle_exists, d_cycle_exists, sizeof(unsigned),
-                            cudaMemcpyDeviceToHost));
-
-  CUDA_SAFE_CALL(cudaFree(d_cycle_exists));
-  CUDA_SAFE_CALL(cudaFree(d_constants));
-
-  return (cycle_exists != 0);
-}
-
-void PrintStashContents(const Entry *d_stash) {
-  Entry *stash = new Entry[cuhash::kStashSize];
-  CUDA_SAFE_CALL(cudaMemcpy(stash, d_stash, sizeof(Entry) * cuhash::kStashSize,
-                            cudaMemcpyDeviceToHost));
-  for (unsigned i = 0; i < cuhash::kStashSize; ++i) {
-    if (get_key(stash[i]) != kKeyEmpty) {
-      char buffer[256];
-      sprintf(buffer, "Stash[%u]: %u = %u", i, get_key(stash[i]),
-              get_value(stash[i]));
-      PrintMessage(buffer, true);
-    }
-  }
-  delete[] stash;
-}
-
-}; // namespace cuhash
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
--- a/src/cuhash/hash_functions.cpp
+++ b/src/cuhash/hash_functions.cpp
-// nvcc (cuda) 9.0 with gcc 5.5 don't support random, so compile it in host
-
-#include <random>
-
-namespace cuhash {
-
-std::random_device random_dev;
-
-std::mt19937 random_engine(random_dev());
-std::uniform_int_distribution<unsigned> uint_distribution;
-
-unsigned generate_random_uint32() { return uint_distribution(random_engine); }
-
-} // namespace cuhash
\ No newline at end of file
--- a/src/cuhash/hash_functions.cu
+++ b/src/cuhash/hash_functions.cu
-#include <cassert>
-#include <cuhash/debugging.h>
-#include <cuhash/hash_functions.h>
-#include <cuhash/hash_table.h>
-
-namespace cuhash {
-
-void GenerateFunctions(const unsigned N, const unsigned num_keys,
-                       const unsigned *d_keys, const unsigned table_size,
-                       uint2 *constants) {
-  bool regenerate = true;
-
-  while (regenerate) {
-    regenerate = false;
-
-    // Generate a set of hash function constants for this build attempt.
-    for (unsigned i = 0; i < N; ++i) {
-      // uint_distribution(random_engine) % kPrimeDivisor;
-      // genrand_int32() % kPrimeDivisor;
-      unsigned new_a = generate_random_uint32() % kPrimeDivisor;
-      constants[i].x = (1 > new_a ? 1 : new_a);
-      constants[i].y = generate_random_uint32() % kPrimeDivisor;
-    }
-
-#ifdef FORCEFULLY_GENERATE_NO_CYCLES
-    // Ensure that every key gets N different slots.
-    regenerate =
-        CheckAssignedSameSlot(N, num_keys, d_keys, table_size, constants);
-#endif
-  }
-
-#ifdef TAKE_HASH_FUNCTION_STATISTICS
-  // Examine how well distributed the items are.
-  TakeHashFunctionStatistics(num_keys, d_keys, table_size, constants, N);
-#endif
-}
-
-}; // namespace cuhash
--- a/src/cuhash/hash_table.cpp
+++ b/src/cuhash/hash_table.cpp
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file hash_table.cpp
- *
- * @brief Implements a basic hash table that stores one value per key.
- */
-
-#include <cuhash/debugging.h>
-#include <cuhash/hash_table.h>
-
-#include <algorithm>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <cuda_runtime_api.h>
-#include <cuhash/cuda_util.h>
-#include <limits>
-
-namespace cuhash {
-
-char buffer[256];
-
-//! @name Internal
-/// @{
-dim3 ComputeGridDim(unsigned n) {
-  // Round up in order to make sure all items are hashed in.
-  dim3 grid((n + kBlockSize - 1) / kBlockSize);
-  if (grid.x > kGridSize) {
-    grid.y = (grid.x + kGridSize - 1) / kGridSize;
-    grid.x = kGridSize;
-  }
-  return grid;
-}
-
-unsigned ComputeMaxIterations(const unsigned n, const unsigned table_size,
-                              const unsigned num_functions) {
-  float lg_input_size = (float)(log((double)n) / log(2.0));
-
-// #define CONSTANT_ITERATIONS
-#ifdef CONSTANT_ITERATIONS
-  // Set the maximum number of iterations to 7lg(N).
-  const unsigned MAX_ITERATION_CONSTANT = 7;
-  unsigned max_iterations = MAX_ITERATION_CONSTANT * lg_input_size;
-#else
-  // Use an empirical formula for determining what the maximum number of
-  // iterations should be.  Works OK in most situations.
-  float load_factor = float(n) / table_size;
-  float ln_load_factor = (float)(log(load_factor) / log(2.71828183));
-
-  unsigned max_iterations =
-      (unsigned)(4.0 * ceil(-1.0 / (0.028255 + 1.1594772 * ln_load_factor) *
-                            lg_input_size));
-#endif
-  return max_iterations;
-}
-/// @}
-
-HashTable::HashTable()
-    : table_size_(0), d_contents_(NULL), stash_count_(0), d_failures_(NULL) {
-  CUDA_CHECK_ERROR("Failed in constructor.\n");
-}
-
-bool HashTable::Initialize(const unsigned max_table_entries,
-                           const float space_usage,
-                           const unsigned num_functions) {
-  Release();
-
-  // Determine the minimum amount of slots the table requires,
-  // and whether the space_usage is within range.
-  float minimum_space_usage;
-  if (num_functions < 2 || num_functions > 5) {
-    char message[256] = "Number of hash functions must be from 2 to 5; "
-                        "others are unimplemented.";
-    PrintMessage(message, true);
-    return false;
-  } else {
-    minimum_space_usage = kMinimumSpaceUsages[num_functions];
-  }
-
-  if (space_usage < minimum_space_usage) {
-    sprintf(buffer, "Minimum possible space usage for %u functions is %f.",
-            num_functions, minimum_space_usage);
-    PrintMessage(buffer);
-    return false;
-  }
-
-  num_hash_functions_ = num_functions;
-  table_size_ = unsigned(ceil(max_table_entries * space_usage));
-
-  // Allocate memory.
-  const unsigned slots_to_allocate = table_size_ + kStashSize;
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_contents_, sizeof(Entry) * slots_to_allocate));
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_failures_, sizeof(unsigned)));
-  if (!d_contents_ || !d_failures_) {
-    fprintf(stderr, "Failed to allocate %u slots.\n", slots_to_allocate);
-    return false;
-  }
-  CUDA_CHECK_ERROR("Failed to initialize.\n");
-
-  return true;
-}
-
-void HashTable::Release() {
-  table_size_ = 0;
-
-  CUDA_SAFE_CALL(cudaFree(d_contents_));
-  CUDA_SAFE_CALL(cudaFree(d_failures_));
-
-  d_contents_ = NULL;
-  d_failures_ = NULL;
-
-  CUDA_CHECK_ERROR("Failed during release.\n");
-}
-
-bool HashTable::Build(const unsigned n, const unsigned *d_keys,
-                      const unsigned *d_values) {
-  unsigned max_iterations =
-      ComputeMaxIterations(n, table_size_, num_hash_functions_);
-  unsigned num_failures = 1;
-  unsigned num_attempts = 0;
-
-  // Storage for statistics collection.
-  unsigned *d_iterations_taken = NULL;
-#ifdef TRACK_ITERATIONS
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_iterations_taken, sizeof(unsigned) * n));
-#endif
-
-  // Track how many items ended up in the stash.
-  unsigned *d_stash_count = NULL;
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_stash_count, sizeof(unsigned)));
-  CUDA_CHECK_ERROR("Failed before main build loop.\n");
-
-  // Main build loop.
-  while (num_failures && ++num_attempts < kMaxRestartAttempts) {
-    CUDA_SAFE_CALL(cudaMemset(d_stash_count, 0, sizeof(unsigned)));
-
-    // Generate new hash functions.
-    if (num_hash_functions_ == 2)
-      constants_2_.Generate(n, d_keys, table_size_);
-    else if (num_hash_functions_ == 3)
-      constants_3_.Generate(n, d_keys, table_size_);
-    else if (num_hash_functions_ == 4)
-      constants_4_.Generate(n, d_keys, table_size_);
-    else
-      constants_5_.Generate(n, d_keys, table_size_);
-
-    stash_constants_.x = std::max(1u, generate_random_uint32()) % kPrimeDivisor;
-    stash_constants_.y = generate_random_uint32() % kPrimeDivisor;
-    stash_count_ = 0;
-
-    // Initialize memory.
-    unsigned slots_in_table = table_size_ + kStashSize;
-    CUDAWrapper::ClearTable(slots_in_table, kEntryEmpty, d_contents_);
-
-    num_failures = 0;
-
-    CUDAWrapper::CallCuckooHash(
-        n, num_hash_functions_, d_keys, d_values, table_size_, constants_2_,
-        constants_3_, constants_4_, constants_5_, max_iterations, d_contents_,
-        stash_constants_, d_stash_count, d_failures_, d_iterations_taken);
-
-    // Check if successful.
-    CUDA_SAFE_CALL(cudaMemcpy(&num_failures, d_failures_, sizeof(unsigned),
-                              cudaMemcpyDeviceToHost));
-
-#ifdef COUNT_UNINSERTED
-    if (num_failures) {
-      printf("Failed to insert %u items.\n", num_failures);
-    }
-#endif
-  }
-
-  // Copy out the stash size.
-  CUDA_SAFE_CALL(cudaMemcpy(&stash_count_, d_stash_count, sizeof(unsigned),
-                            cudaMemcpyDeviceToHost));
-  if (stash_count_ && num_failures == 0) {
-    // sprintf(buffer, "Stash size: %u", stash_count_);
-    // PrintMessage(buffer, true);
-
-#ifdef _DEBUG
-    PrintStashContents(d_contents_ + table_size_);
-#endif
-  }
-  CUDA_SAFE_CALL(cudaFree(d_stash_count));
-
-#ifdef TRACK_ITERATIONS
-  if (num_failures == 0) {
-    OutputBuildStatistics(n, d_iterations_taken);
-  }
-  CUDA_SAFE_CALL(cudaFree(d_iterations_taken));
-#endif
-
-  // Dump some info if a restart was required.
-  if (num_attempts >= kMaxRestartAttempts) {
-    sprintf(buffer, "Completely failed to build");
-    PrintMessage(buffer, true);
-  } else if (num_attempts > 1) {
-    sprintf(buffer, "Needed %u attempts to build, you can ignore this message.",
-            num_attempts);
-    PrintMessage(buffer, true);
-  }
-
-  CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
-  return num_failures == 0;
-}
-
-void HashTable::Retrieve(const unsigned n_queries, const unsigned *d_keys,
-                         unsigned *d_values) {
-  CUDAWrapper::CallHashRetrieve(n_queries, num_hash_functions_, d_keys,
-                                table_size_, d_contents_, constants_2_,
-                                constants_3_, constants_4_, constants_5_,
-                                stash_constants_, stash_count_, d_values);
-}
-
-}; // namespace cuhash
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
--- a/src/cuhash/hash_table.cu
+++ b/src/cuhash/hash_table.cu
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file hash_table.cu
- *
- * @brief Hides all of the CUDA calls from the actual CPP file.
- */
-
-#include <cuhash/cuda_util.h>
-#include <cuhash/debugging.h>
-#include <cuhash/definitions.h>
-#include <cuhash/hash_table.cuh>
-
-#include <cuda.h>
-
-namespace cuhash {
-
-namespace CUDAWrapper {
-void ClearTable(const unsigned slots_in_table, const Entry fill_value,
-                Entry *d_contents) {
-  clear_table<Entry><<<ComputeGridDim(slots_in_table), kBlockSize>>>(
-      slots_in_table, fill_value, d_contents);
-  TV_CHECK_CUDA_ERR_V2("Error occurred during hash table clear.\n");
-}
-
-void CallCuckooHash(const unsigned n, const unsigned num_hash_functions,
-                    const unsigned *d_keys, const unsigned *d_values,
-                    const unsigned table_size, const Functions<2> constants_2,
-                    const Functions<3> constants_3,
-                    const Functions<4> constants_4,
-                    const Functions<5> constants_5,
-                    const unsigned max_iterations, Entry *d_contents,
-                    uint2 stash_constants, unsigned *d_stash_count,
-                    unsigned *d_failures, unsigned *d_iterations_taken) {
-  // Build the table.
-  cudaMemset(d_failures, 0, sizeof(unsigned));
-  if (num_hash_functions == 2) {
-    CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
-        n, d_keys, d_values, table_size, constants_2, max_iterations,
-        d_contents, stash_constants, d_stash_count, d_failures,
-        d_iterations_taken);
-  } else if (num_hash_functions == 3) {
-    CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
-        n, d_keys, d_values, table_size, constants_3, max_iterations,
-        d_contents, stash_constants, d_stash_count, d_failures,
-        d_iterations_taken);
-  } else if (num_hash_functions == 4) {
-    CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
-        n, d_keys, d_values, table_size, constants_4, max_iterations,
-        d_contents, stash_constants, d_stash_count, d_failures,
-        d_iterations_taken);
-  } else {
-    CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
-        n, d_keys, d_values, table_size, constants_5, max_iterations,
-        d_contents, stash_constants, d_stash_count, d_failures,
-        d_iterations_taken);
-  }
-
-  CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
-}
-
-void CallHashRetrieve(const unsigned n_queries,
-                      const unsigned num_hash_functions, const unsigned *d_keys,
-                      const unsigned table_size, const Entry *d_contents,
-                      const Functions<2> constants_2,
-                      const Functions<3> constants_3,
-                      const Functions<4> constants_4,
-                      const Functions<5> constants_5,
-                      const uint2 stash_constants, const unsigned stash_count,
-                      unsigned *d_values) {
-  unsigned *d_retrieval_probes = NULL;
-#ifdef TRACK_ITERATIONS
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_retrieval_probes, sizeof(unsigned) * n_queries));
-#endif
-
-  if (num_hash_functions == 2) {
-    hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
-        n_queries, d_keys, table_size, d_contents, constants_2, stash_constants,
-        stash_count, d_values, d_retrieval_probes);
-  } else if (num_hash_functions == 3) {
-    hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
-        n_queries, d_keys, table_size, d_contents, constants_3, stash_constants,
-        stash_count, d_values, d_retrieval_probes);
-  } else if (num_hash_functions == 4) {
-    hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
-        n_queries, d_keys, table_size, d_contents, constants_4, stash_constants,
-        stash_count, d_values, d_retrieval_probes);
-  } else {
-    hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
-        n_queries, d_keys, table_size, d_contents, constants_5, stash_constants,
-        stash_count, d_values, d_retrieval_probes);
-  }
-
-  CUDA_CHECK_ERROR("Retrieval failed.\n");
-
-#ifdef TRACK_ITERATIONS
-  OutputRetrievalStatistics(n_queries, d_retrieval_probes, num_hash_functions);
-  CUDA_SAFE_CALL(cudaFree(d_retrieval_probes));
-#endif
-}
-}; // namespace CUDAWrapper
-
-}; // namespace cuhash
--- a/src/cuhash/main.cc
+++ b/src/cuhash/main.cc
-#include <cuda.h>
-#include <cuhash/hash_table.h>
-
-int main() {
-  auto table = cuhash::HashTable();
-  table.Initialize(10, 2.0);
-  const int N = 10;
-
-  // ハッシュテーブルに格納するデータ
-  int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8};
-  int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-
-  // デバイスメモリにコピー
-  int *d_keys, *d_vals;
-  cudaMalloc((void **)&d_keys, sizeof(int) * N);
-  cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMalloc((void **)&d_vals, sizeof(int) * N);
-  cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice);
-
-  // ハッシュテーブルにクエリするデータ
-  int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-  int output[N];
-
-  // デバイスメモリにコピー
-  int *d_input, *d_output;
-  cudaMalloc((void **)&d_input, sizeof(int) * N);
-  cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMalloc((void **)&d_output, sizeof(int) * N);
-  cudaMemset(d_output, 0, sizeof(int) * N);
-  bool s = table.Build(N, (const unsigned int *)d_keys,
-                       (const unsigned int *)d_vals);
-
-  std::cout << s << std::endl;
-  table.Retrieve(N, (const unsigned int *)d_input, (unsigned int *)d_output);
-
-  std::cout << s << std::endl;
-  cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  for (int i = 0; i < N; ++i) {
-    printf("%d\n", output[i]);
-  }
-
-  return 0;
-}
\ No newline at end of file
--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
-set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc)
-if (SPCONV_BuildCUDA)
-    set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc)
-endif()
-add_library(spconv SHARED ${ALL_FILES})
-
-find_package(OpenMP)
-if(OpenMP_CXX_FOUND)
-    target_link_libraries(spconv PUBLIC OpenMP::OpenMP_CXX)
-endif()
-
-
-target_include_directories(spconv PRIVATE ${ALL_INCLUDE} )
-set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
-set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
-set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-if (SPCONV_BuildCUDA)
-    target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash)
-else()
-    target_link_libraries(spconv PRIVATE ${ALL_LIBS})
-endif()
-install (TARGETS spconv DESTINATION lib)
--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <spconv/fused_spconv_ops.h>
-#include <spconv/nms_ops.h>
-#include <spconv/pillar_scatter_ops.h>
-#include <spconv/pool_ops.h>
-#include <spconv/spconv_ops.h>
-#include <torch/script.h>
-
-static auto registry =
-    torch::RegisterOperators()
-        .op("spconv::get_indice_pairs", &spconv::getIndicePairs)
-        .op("spconv::indice_conv", &spconv::indiceConv)
-        .op("spconv::indice_conv_batch", &spconv::indiceConvBatch)
-        .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
-        .op("spconv::fused_indice_conv_bn", &spconv::fusedIndiceConvBatchNorm)
-        .op("spconv::indice_maxpool", &spconv::indiceMaxPool)
-        .op("spconv::indice_maxpool_backward", &spconv::indiceMaxPoolBackward)
-        .op("spconv::nms", &spconv::nonMaxSuppression<float>)
-        .op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>)
-        .op("spconv::pillar_scatter_half",
-            &spconv::pointPillarScatter<at::Half>);
--- a/src/spconv/cublas_gemm.cc
+++ b/src/spconv/cublas_gemm.cc
-#include <ATen/ATen.h>
-#include <spconv/cublas_gemm.h>
-
-namespace spconv {
-template <>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
-                           cublasOperation_t transb, int m, int n, int k,
-                           const float *alpha, const float *A, int lda,
-                           const float *B, int ldb, const float *beta, float *C,
-                           int ldc) {
-  return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
-                     beta, C, ldc);
-}
-
-template <>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
-                           cublasOperation_t transb, int m, int n, int k,
-                           const __half *alpha, const __half *A, int lda,
-                           const __half *B, int ldb, const __half *beta,
-                           __half *C, int ldc) {
-  return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
-                     beta, C, ldc);
-}
-
-template <>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
-                           cublasOperation_t transb, int m, int n, int k,
-                           const at::Half *alpha, const at::Half *A, int lda,
-                           const at::Half *B, int ldb, const at::Half *beta,
-                           at::Half *C, int ldc) {
-  return cublasHgemm(handle, transa, transb, m, n, k,
-                     reinterpret_cast<const __half *>(alpha),
-                     reinterpret_cast<const __half *>(A), lda,
-                     reinterpret_cast<const __half *>(B), ldb,
-                     reinterpret_cast<const __half *>(beta),
-                     reinterpret_cast<__half *>(C), ldc);
-}
-
-template <>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
-                           cublasOperation_t transb, int m, int n, int k,
-                           const double *alpha, const double *A, int lda,
-                           const double *B, int ldb, const double *beta,
-                           double *C, int ldc) {
-  return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
-                     beta, C, ldc);
-}
-
-
-} // namespace spconv
\ No newline at end of file
--- a/src/spconv/indice.cc
+++ b/src/spconv/indice.cc
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/Parallel.h>
-#include <spconv/geometry.h>
-#include <spconv/indice.h>
-#include <spconv/spconv_ops.h>
-#include <tensorview/tensor.h>
-#include <torch/script.h>
-
-namespace spconv {
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
-                         tv::TensorView<Index> indicesOut,
-                         tv::TensorView<IndexGrid> gridsOut,
-                         tv::TensorView<Index> indicePairs,
-                         tv::TensorView<Index> indiceNum,
-                         const Index *kernelSize, const Index *stride,
-                         const Index *padding, const Index *dilation,
-                         const Index *outSpatialShape) {
-  // indicesOut: num_active * kernelVolume * (NDim + 1)
-  Index numAct = 0;
-  auto numActIn = indicesIn.dim(0);
-  Index batchIdx = 0;
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index *validPoints = validPoints_.data();
-  Index *pointPtr = nullptr;
-  Index hashval;
-  tsl::robin_map<Index, Index> hash;
-  for (int j = 0; j < numActIn; ++j) {
-    batchIdx = indicesIn(j, 0);
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
-        dilation, outSpatialShape, validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
-                   spatialVolume * batchIdx;
-      auto iter = hash.find(index);
-      if (iter == hash.end()) {
-        for (unsigned k = 1; k < NDim + 1; ++k) {
-          indicesOut(numAct, k) = pointPtr[k - 1];
-        }
-        indicesOut(numAct, 0) = batchIdx;
-        hashval = numAct++;
-        hash[index] = hashval;
-      } else {
-        hashval = iter->second;
-      }
-      // indicePairs: [K, 2, L]
-      indicePairs(0, offset, indiceNum[offset]) = j;
-      indicePairs(1, offset, indiceNum[offset]++) = hashval;
-    }
-  }
-  return numAct;
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
-                           tv::TensorView<Index> indicesOut,
-                           tv::TensorView<IndexGrid> gridsOut,
-                           tv::TensorView<Index> indicePairs,
-                           tv::TensorView<Index> indiceNum,
-                           const Index *kernelSize, const Index *stride,
-                           const Index *padding, const Index *dilation,
-                           const Index *outSpatialShape) {
-  Index numAct = 0;
-  auto numActIn = indicesIn.dim(0);
-  Index batchIdx = 0;
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index *validPoints = validPoints_.data();
-  Index *pointPtr = nullptr;
-  Index hashval;
-  tsl::robin_map<Index, Index> hash;
-  for (int j = 0; j < numActIn; ++j) {
-    batchIdx = indicesIn(j, 0);
-    numValidPoints = getValidOutPosTranspose<Index, NDim>(
-        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
-        dilation, outSpatialShape, validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
-                   spatialVolume * batchIdx;
-
-      auto iter = hash.find(index);
-      if (iter == hash.end()) {
-        for (unsigned k = 1; k < NDim + 1; ++k) {
-          indicesOut(numAct, k) = pointPtr[k - 1];
-        }
-        indicesOut(numAct, 0) = batchIdx;
-        hashval = numAct++;
-        hash[index] = hashval;
-      } else {
-        hashval = iter->second;
-      }
-      // indicePairs: [K, 2, L]
-      indicePairs(0, offset, indiceNum[offset]) = j;
-      indicePairs(1, offset, indiceNum[offset]++) = hashval;
-    }
-  }
-  return numAct;
-}
-
-#ifndef TV_WINDOWS
-template <typename Index, typename IndexGrid, unsigned NDim>
-Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
-                         tv::TensorView<IndexGrid> gridsOut,
-                         tv::TensorView<Index> indicePairs,
-                         tv::TensorView<Index> indiceNum,
-                         const Index *const kernelSize,
-                         const Index *const stride, const Index *const padding,
-                         const Index *dilation,
-                         const Index *const outSpatialShape) {
-  Index numAct = 0;
-  auto numActIn = indicesIn.dim(0);
-  Index batchIdx = 0;
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  tsl::robin_map<Index, Index> hash;
-  for (int j = 0; j < numActIn; ++j) {
-    Index index = 0;
-    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
-                                         outSpatialShape) +
-            spatialVolume * indicesIn(j, 0);
-    hash[index] = j;
-  }
-
-  at::parallel_for(0, numActIn, 0, [&](int64_t begin, int64_t end) {
-    Index index = 0;
-    Index numValidPoints = 0;
-    std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-    Index *validPoints = validPoints_.data();
-    Index *pointPtr = nullptr;
-    Index oldOffset = 0;
-    for (int j = begin; j < end; ++j) {
-      numValidPoints = getValidOutPos<Index, NDim>(
-          indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
-          dilation, outSpatialShape, validPoints);
-      for (Index i = 0; i < numValidPoints; ++i) {
-        pointPtr = validPoints + i * (NDim + 1);
-        auto offset = pointPtr[NDim];
-        index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
-                spatialVolume * indicesIn(j, 0);
-        auto iter = hash.find(index);
-        if (iter != hash.end()) {
-#pragma omp atomic capture
-          oldOffset = indiceNum[offset]++;
-          indicePairs(0, offset, oldOffset) = j;
-          indicePairs(1, offset, oldOffset) = iter->second;
-        }
-      }
-    }
-  });
-  return numActIn;
-}
-#else
-template <typename Index, typename IndexGrid, unsigned NDim>
-Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
-                         tv::TensorView<IndexGrid> gridsOut,
-                         tv::TensorView<Index> indicePairs,
-                         tv::TensorView<Index> indiceNum,
-                         const Index *const kernelSize,
-                         const Index *const stride, const Index *const padding,
-                         const Index *dilation,
-                         const Index *const outSpatialShape) {
-  Index numAct = 0;
-  auto numActIn = indicesIn.dim(0);
-  Index batchIdx = 0;
-  Index spatialVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    spatialVolume *= outSpatialShape[i];
-  }
-  Index kernelVolume = 1;
-#pragma unroll
-  for (int i = 0; i < NDim; ++i) {
-    kernelVolume *= kernelSize[i];
-  }
-  Index numValidPoints = 0;
-  // Index validPoints[kernelVolume * (NDim + 1)];
-  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
-  Index *validPoints = validPoints_.data();
-  Index *pointPtr = nullptr;
-  tsl::robin_map<Index, Index> hash;
-  for (int j = 0; j < numActIn; ++j) {
-    Index index = 0;
-    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
-                                         outSpatialShape) +
-            spatialVolume * indicesIn(j, 0);
-    hash[index] = j;
-  }
-  Index index = 0;
-  for (int j = 0; j < numActIn; ++j) {
-    numValidPoints = getValidOutPos<Index, NDim>(
-        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
-        dilation, outSpatialShape, validPoints);
-    for (Index i = 0; i < numValidPoints; ++i) {
-      pointPtr = validPoints + i * (NDim + 1);
-      auto offset = pointPtr[NDim];
-      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
-              spatialVolume * indicesIn(j, 0);
-      auto iter = hash.find(index);
-      if (iter != hash.end()) {
-        indicePairs(0, offset, indiceNum[offset]) = j;
-        indicePairs(1, offset, indiceNum[offset]++) = iter->second;
-      }
-    }
-  }
-  return numActIn;
-}
-#endif
-
-int create_conv_indice_pair_cpu(
-    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
-    torch::Tensor indicePairs, torch::Tensor indiceNum,
-    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-    std::vector<int64_t> padding, std::vector<int64_t> dilation,
-    std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
-    bool useHash) {
-  auto ndim = outSpatialShape.size();
-  auto numActIn = indicesIn.size(0);
-  int batchSize = gridsOut.size(0);
-  auto kernelVolume = indiceNum.size(0);
-  if (numActIn == 0)
-    return 0;
-  tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
-    using Index = TV_DECLTYPE(V);
-    using IndexGrid = int32_t;
-    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = TV_DECLTYPE(I)::value;
-      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
-      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
-      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
-      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
-      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
-                                       outSpatialShape.end());
-      if (transpose)
-        numActIn = getIndicePairsDeConv<Index, IndexGrid, NDim>(
-            tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
-            tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
-            tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
-            di.data(), ou.data());
-      else
-        numActIn = getIndicePairsConv<Index, IndexGrid, NDim>(
-            tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
-            tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
-            tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
-            di.data(), ou.data());
-    });
-  });
-  return numActIn;
-}
-
-int create_submconv_indice_pair_cpu(
-    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
-    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash) {
-  auto ndim = outSpatialShape.size();
-  auto numActIn = indicesIn.size(0);
-  int batchSize = gridsOut.size(0);
-  auto kernelVolume = indiceNum.size(0);
-  if (numActIn == 0)
-    return 0;
-  tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
-    using Index = TV_DECLTYPE(V);
-    using IndexGrid = int32_t;
-    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = TV_DECLTYPE(I)::value;
-      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
-      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
-      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
-      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
-      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
-                                       outSpatialShape.end());
-      numActIn = getIndicePairsSubM<Index, IndexGrid, NDim>(
-          tv::torch2tv<Index>(indicesIn), tv::torch2tv<IndexGrid>(gridsOut),
-          tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indiceNum),
-          ks.data(), st.data(), pa.data(), di.data(), ou.data());
-    });
-  });
-  return numActIn;
-}
-
-} // namespace spconv
--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/ATen.h>
-#include <chrono>
-#include <cuhash/hash_table.h>
-#include <limits>
-#include <spconv/indice.cu.h>
-#include <spconv/indice.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensor.h>
-#include <tensorview/tensorview.h>
-#include <tensorview/torch_utils.h>
-#include <thrust/copy.h>
-#include <thrust/execution_policy.h>
-#include <type_traits>
-#include <utility/timer.h>
-
-namespace spconv {
-
-using max_kernel_vol_t = tv::mp_list_c<int, 9, 16, 27, 32, 128, 256, 4096>;
-
-int create_conv_indice_pair_p1_cuda(
-    torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
-    torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose) {
-  auto stream = at::cuda::getCurrentCUDAStream();
-  auto ndim = kernelSize.size();
-  auto numActIn = indicesIn.size(0);
-  auto kernelVolume = indiceNum.size(0);
-  if (numActIn == 0)
-    return 0;
-  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
-    using Index = TV_DECLTYPE(IndexValue);
-    using IndexGrid = int32_t;
-    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = TV_DECLTYPE(I)::value;
-      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
-      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
-      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
-      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
-      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
-                                       outSpatialShape.end());
-      tv::DispatchInt<max_kernel_vol_t>()(
-          kernelVolume, std::less_equal<int>(), [&](auto I2) {
-            constexpr int MaxKernelVolume = TV_DECLTYPE(I2)::value;
-            if (transpose) {
-              prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>
-                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                                  tv::torch2tv<Index>(indicePairs),
-                                  tv::torch2tv<Index>(indiceNum),
-                                  tv::torch2tv<Index>(indicePairUnique), ks, st,
-                                  pa, di, ou);
-              TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
-            } else {
-              prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
-                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                                  tv::torch2tv<Index>(indicePairs),
-                                  tv::torch2tv<Index>(indiceNum),
-                                  tv::torch2tv<Index>(indicePairUnique), ks, st,
-                                  pa, di, ou);
-              TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
-            }
-#ifdef TV_LOG_KERNEL_INFO
-            cudaFuncAttributes attr;
-            checkCudaErrors(cudaFuncGetAttributes(
-                &attr,
-                prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
-            tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim,
-                        MaxKernelVolume, ">", attr.numRegs);
-#endif
-          });
-    });
-  });
-  return 1;
-}
-
-int create_conv_indice_pair_p2_cuda(
-    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
-    torch::Tensor indicePairs, torch::Tensor indiceNum,
-    torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash) {
-  auto stream = at::cuda::getCurrentCUDAStream();
-  auto ndim = outSpatialShape.size();
-  auto numActIn = indicesIn.size(0);
-  int batchSize = gridsOut.size(0);
-  int numAct = indicePairUnique.size(0) - 1;
-
-  auto kernelVolume = indiceNum.size(0);
-  if (numActIn == 0)
-    return 0;
-  bool failed = false;
-  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
-    using Index = TV_DECLTYPE(IndexValue);
-    using IndexGrid = int32_t;
-    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = TV_DECLTYPE(I)::value;
-      using IndexGrid = int32_t;
-      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
-                                       outSpatialShape.end());
-      if (useHash) {
-        auto table = cuhash::HashTable();
-        // std::cout << "create " << numAct << " size table..." << std::endl;
-        table.Initialize(numAct, 2.0, 4);
-        unsigned *d_values = nullptr;
-        cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
-        TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
-        arangeKernel<unsigned>
-            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(d_values, numAct);
-        TV_CHECK_CUDA_ERR_V2("arangeKernel failed");
-        bool res = table.Build(
-            numAct,
-            reinterpret_cast<unsigned *>(indicePairUnique.data_ptr<Index>()),
-            d_values);
-        cudaFree(d_values);
-        TV_CHECK_CUDA_ERR_V2("cudaFree failed");
-        if (!res) {
-          failed = true;
-          return;
-        }
-        assignIndiceOutKernel<Index, NDim>
-            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesOut), numAct,
-                         tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
-        TV_CHECK_CUDA_ERR_V2("assignIndiceOutKernel failed");
-
-        auto tableSize = table.get_table_size();
-        auto tableData = table.data();
-        auto constants = table.get_constants_4();
-        auto stash_constants = table.get_stash_constants();
-        auto stash_count = table.get_stash_count();
-        assignIndicePairsHashKernel<Index, NDim>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesOut), numActIn,
-                         tv::torch2tv<Index>(indicePairs),
-                         tv::torch2tv<Index>(indicePairUnique), tableSize,
-                         tableData, constants, stash_constants, stash_count);
-        TV_CHECK_CUDA_ERR_V2("assignIndicePairsHashKernel failed");
-
-      } else {
-        assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
-            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesOut),
-                         tv::torch2tv<IndexGrid>(gridsOut), numAct,
-                         tv::torch2tv<Index>(indicePairs),
-                         tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
-        TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
-        assignIndicePairsKernel<Index, IndexGrid, NDim>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesOut),
-                         tv::torch2tv<IndexGrid>(gridsOut), numActIn,
-                         tv::torch2tv<Index>(indicePairs),
-                         tv::torch2tv<Index>(indicePairUnique), ou);
-        TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
-#ifdef TV_LOG_KERNEL_INFO
-        cudaFuncAttributes attr;
-        checkCudaErrors(cudaFuncGetAttributes(
-            &attr, assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>));
-        tv::ssprint("assignGridAndIndiceOutKernel<", tv::type_s<Index>, NDim,
-                    ">", attr.numRegs);
-        cudaFuncAttributes attr2;
-        checkCudaErrors(cudaFuncGetAttributes(
-            &attr2, assignIndicePairsKernel<Index, IndexGrid, NDim>));
-        tv::ssprint("assignIndicePairsKernel<", tv::type_s<Index>, NDim, ">",
-                    attr2.numRegs);
-#endif
-      }
-
-      if (resetGrid && (!useHash)) {
-        resetGridKernel<Index, IndexGrid, NDim>
-            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(indicePairUnique.data_ptr<Index>(),
-                         tv::torch2tv<IndexGrid>(gridsOut), numAct);
-        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
-      }
-    });
-  });
-  if (failed){
-    return -1;
-  }
-  return numAct;
-}
-
-template <typename T> struct is_valid {
-  __device__ __forceinline__ bool operator()(const T x) { return x != -1; }
-};
-
-int create_submconv_indice_pair_cuda(
-    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
-    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
-    std::vector<int64_t> stride, std::vector<int64_t> padding,
-    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
-    bool transpose, bool resetGrid, bool useHash) {
-  auto stream = at::cuda::getCurrentCUDAStream();
-  auto ndim = outSpatialShape.size();
-  auto numActIn = indicesIn.size(0);
-  int batchSize = gridsOut.size(0);
-
-  auto kernelVolume = indiceNum.size(0);
-  if (numActIn == 0)
-    return 0;
-  bool failed = false;
-
-  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
-    using Index = TV_DECLTYPE(IndexValue);
-    using IndexGrid = int32_t;
-    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
-      constexpr int NDim = TV_DECLTYPE(I)::value;
-      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
-      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
-      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
-      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
-      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
-                                       outSpatialShape.end());
-      Index spatialVolume = 1;
-      for (int i = 0; i < NDim; ++i) {
-        spatialVolume *= outSpatialShape[i];
-      }
-      if (useHash) {
-        auto table = cuhash::HashTable();
-        // std::cout << "create " << numAct << " size table..." << std::endl;
-        table.Initialize(numActIn, 2.0, 4);
-        unsigned *d_keyvalues = nullptr;
-        cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
-        unsigned *d_values = d_keyvalues + numActIn;
-        TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
-        prepareSubMHashKernel<Index, NDim>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesIn), d_keyvalues, d_values,
-                         ou);
-        TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
-        bool res =
-            table.Build(numActIn, reinterpret_cast<unsigned *>(d_keyvalues),
-                        reinterpret_cast<unsigned *>(d_values));
-        cudaFree(d_keyvalues);
-        TV_CHECK_CUDA_ERR_V2("cudaFree failed");
-        if (!res) {
-          failed = true;
-          return;
-        }
-        auto tableSize = table.get_table_size();
-        auto tableData = table.data();
-        auto constants = table.get_constants_4();
-        auto stash_constants = table.get_stash_constants();
-        auto stash_count = table.get_stash_count();
-        tv::DispatchInt<max_kernel_vol_t>()(
-            kernelVolume, std::less_equal<int>(), [&](auto I2) {
-              constexpr int MaxKernelVolume = TV_DECLTYPE(I2)::value;
-              getSubMIndicePairsHashKernel<Index, NDim, MaxKernelVolume>
-                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                                  tv::torch2tv<Index>(indicePairs),
-                                  tv::torch2tv<Index>(indiceNum), ks, st, pa,
-                                  di, ou, tableSize, tableData, constants,
-                                  stash_constants, stash_count);
-              TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
-            });
-      } else {
-        // auto timer = spconv::CudaContextTimer<>();
-        prepareSubMGridKernel<Index, IndexGrid, NDim>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(tv::torch2tv<Index>(indicesIn),
-                         tv::torch2tv<IndexGrid>(gridsOut), ou, spatialVolume);
-        // tv::ssprint("prepareSubMGridKernel", timer.report() / 1000.0);
-        TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
-        // when dilation all one, we use a simple kernel to calc result
-        bool dilation_one = true;
-        for (int i = 0; i < NDim; ++i) {
-          dilation_one &= di[i] == 1;
-        }
-        auto found = false;
-        if (dilation_one && (NDim == 2 || NDim == 3)) {
-          auto indiceNumCpu = indiceNum.cpu();
-          if (NDim == 2) {
-            tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
-                                           outSpatialShape.end());
-
-            tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
-              tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
-                constexpr int K0 = TV_DECLTYPE(K0C)::value;
-                constexpr int K1 = TV_DECLTYPE(K1C)::value;
-                found = true;
-                getSubMIndicePairsKernel2<Index, IndexGrid, K0, K1>
-                    <<<tv::cuda::getBlocks(numActIn),
-                       tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
-                        tv::torch2tv<Index>(indicesIn),
-                        tv::torch2tv<IndexGrid>(gridsOut),
-                        tv::torch2tv<Index>(indicePairs),
-                        tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
-              });
-            });
-          } else if (NDim == 3) {
-            tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
-                                           outSpatialShape.end());
-
-            tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
-              tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
-                tv::dispatch_int_noexcept<1, 3, 5>(
-                    kernelSize[2], [&](auto K2C) {
-                      constexpr int K0 = TV_DECLTYPE(K0C)::value;
-                      constexpr int K1 = TV_DECLTYPE(K1C)::value;
-                      constexpr int K2 = TV_DECLTYPE(K2C)::value;
-                      found = true;
-                      getSubMIndicePairsKernel3<Index, IndexGrid, K0, K1, K2>
-                          <<<tv::cuda::getBlocks(numActIn),
-                             tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
-                              tv::torch2tv<Index>(indicesIn),
-                              tv::torch2tv<IndexGrid>(gridsOut),
-                              tv::torch2tv<Index>(indicePairs),
-                              tv::torch2tv<Index>(indiceNum), ou_,
-                              spatialVolume);
-                    });
-              });
-            });
-          }
-        }
-        if (!found) {
-          tv::DispatchInt<
-              max_kernel_vol_t>()(ndim, std::less_equal<int>(), [&](auto I2) {
-            constexpr int MaxKernelVolume = TV_DECLTYPE(I2)::value;
-            getSubMIndicePairsKernel<Index, IndexGrid, NDim, MaxKernelVolume>
-                <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-                   stream>>>(tv::torch2tv<Index>(indicesIn),
-                             tv::torch2tv<IndexGrid>(gridsOut),
-                             tv::torch2tv<Index>(indicePairs),
-                             tv::torch2tv<Index>(indiceNum), ks, st, pa, di,
-                             ou);
-            TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsKernel failed");
-          });
-        }
-        // tv::ssprint("getSubMIndicePairsKernel", timer.report() / 1000.0);
-      }
-
-      if (resetGrid && (!useHash)) {
-        resetGridSubMKernel<Index, IndexGrid, NDim>
-            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
-               stream>>>(indicesIn.data_ptr<Index>(),
-                         tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn);
-        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
-      }
-    });
-  });
-  if (failed){
-    return -1;
-  }
-
-  return numActIn;
-}
-
-} // namespace spconv
\ No newline at end of file
--- a/src/spconv/maxpool.cc
+++ b/src/spconv/maxpool.cc
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <spconv/maxpool.h>
-#include <torch/script.h>
-
-namespace spconv {
-
-using float_types_t = tv::mp_list<float, double, at::Half>;
-using int_types_t = tv::mp_list<int32_t, int64_t>;
-
-void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                     torch::Tensor indicesIn, torch::Tensor indicesOut,
-                     int size) {
-  if (size <= 0)
-    return;
-  int stride = inFeatures.size(1);
-  auto dtype = inFeatures.scalar_type();
-  auto int_dtype = indicesIn.scalar_type();
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = TV_DECLTYPE(TValue);
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
-      using Index = TV_DECLTYPE(IndexValue);
-      auto outFeaturesData = outFeatures.data_ptr<T>();
-      auto inFeaturesData = inFeatures.data_ptr<T>();
-      auto indicesInData = indicesIn.data_ptr<Index>();
-      auto indicesOutData = indicesOut.data_ptr<Index>();
-      Index idxi, idxo;
-      for (int row = 0; row < size; row++) {
-        idxi = indicesInData[row] * stride;
-        idxo = indicesOutData[row] * stride;
-        for (int plane = 0; plane < stride; ++plane)
-          if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
-            outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
-      }
-    });
-  });
-}
-
-void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                     torch::Tensor dout, torch::Tensor din,
-                     torch::Tensor indicesIn, torch::Tensor indicesOut,
-                     int size) {
-  if (size <= 0)
-    return;
-  int stride = inFeatures.size(1);
-  auto dtype = inFeatures.scalar_type();
-  auto int_dtype = indicesIn.scalar_type();
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = TV_DECLTYPE(TValue);
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
-      using Index = TV_DECLTYPE(IndexValue);
-      auto outFeaturesData = outFeatures.data_ptr<T>();
-      auto inFeaturesData = inFeatures.data_ptr<T>();
-      auto doutData = dout.data_ptr<T>();
-      auto dinData = din.data_ptr<T>();
-      auto indicesInData = indicesIn.data_ptr<Index>();
-      auto indicesOutData = indicesOut.data_ptr<Index>();
-      Index idxi, idxo;
-      for (int row = 0; row < size; row++) {
-        idxi = indicesInData[row] * stride;
-        idxo = indicesOutData[row] * stride;
-        for (int plane = 0; plane < stride; ++plane)
-          if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
-            dinData[idxi + plane] += doutData[idxo + plane];
-      }
-    });
-  });
-}
-
-} // namespace spconv
--- a/src/spconv/maxpool.cu
+++ b/src/spconv/maxpool.cu
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/ATen.h>
-#include <chrono>
-#include <limits>
-#include <spconv/maxpool.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensorview.h>
-#include <type_traits>
-
-namespace spconv {
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
-                                      const Index *indicesIn,
-                                      const Index *indicesOut, int numHot,
-                                      int numPlanes) {
-  T in, out;
-  int ILPStrideY[NumILP];
-  Index idxo, idxi;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
-  outFeatures += blockIdx.y * NumTLP;
-  inFeatures += blockIdx.y * NumTLP;
-  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
-       ix += blockDim.x * gridDim.x) {
-    {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-        in = inFeatures[idxi];
-        out = outFeatures[idxo];
-        if (in > out) {
-          outFeatures[idxo] = in;
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void
-maxPoolFwdGenericBlockKernel(T *outFeatures, const T *inFeatures,
-                             const Index *indicesIn, const Index *indicesOut,
-                             int numHot, int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideX[NumILP];
-  Index RI[NumILP];
-  Index RO[NumILP];
-  T in, out;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
-      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        in = inFeatures[RI[ilp] + iy];
-        out = outFeatures[RO[ilp] + iy];
-        if (in > out) {
-          outFeatures[RO[ilp] + iy] = in;
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
-__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
-                                         const Index *indicesIn,
-                                         const Index *indicesOut, int numHot,
-                                         int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideY[NumILP];
-  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
-  T bufi[vecloadFactor];
-  T bufo[vecloadFactor];
-  Index idxi, idxo;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
-  outFeatures += blockIdx.y * NumTLP;
-  inFeatures += blockIdx.y * NumTLP;
-  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
-       ix += blockDim.x * gridDim.x * vecloadFactor) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-      reinterpret_cast<VecType *>(bufo)[0] =
-          reinterpret_cast<VecType *>(outFeatures)[idxo];
-      reinterpret_cast<VecType *>(bufi)[0] =
-          reinterpret_cast<const VecType *>(inFeatures)[idxi];
-#pragma unroll
-      for (int i = 0; i < vecloadFactor; i++) {
-        if (bufi[i] > bufo[i]) {
-          bufo[i] = bufi[i];
-        }
-      }
-      reinterpret_cast<VecType *>(outFeatures)[idxo] =
-          reinterpret_cast<VecType *>(bufo)[0];
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
-                                        const Index *indicesIn,
-                                        const Index *indicesOut, int numHot,
-                                        int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideX[NumILP];
-  Index RI[NumILP];
-  Index RO[NumILP];
-  T in, out;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < numHot) {
-        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
-        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
-      }
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < numHot) {
-          in = inFeatures[RI[ilp] + iy];
-          out = outFeatures[RO[ilp] + iy];
-          if (in > out) {
-            outFeatures[RO[ilp] + iy] = in;
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void
-maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures, const T *dout,
-                      T *din, const Index *indicesIn, const Index *indicesOut,
-                      int numHot, int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  T in, out;
-  Index idxo, idxi;
-  int ILPStrideY[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
-  outFeatures += blockIdx.y * NumTLP;
-  inFeatures += blockIdx.y * NumTLP;
-  dout += blockIdx.y * NumTLP;
-  din += blockIdx.y * NumTLP;
-  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
-       ix += blockDim.x * gridDim.x) {
-    {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-        in = inFeatures[idxi];
-        out = outFeatures[idxo];
-        if (in == out) {
-          din[idxi] += dout[idxo];
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
-                                             const T *inFeatures, const T *dout,
-                                             T *din, const Index *indicesIn,
-                                             const Index *indicesOut,
-                                             int numHot, int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideX[NumILP];
-  Index RI[NumILP];
-  Index RO[NumILP];
-  T in, out;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
-      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        in = inFeatures[RI[ilp] + iy];
-        out = outFeatures[RO[ilp] + iy];
-        if (in == out) {
-          din[RI[ilp] + iy] += dout[RO[ilp] + iy];
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
-__global__ void
-maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
-                         const T *dout, T *din, const Index *indicesIn,
-                         const Index *indicesOut, int numHot, int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideY[NumILP];
-  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
-  T bufi[vecloadFactor];
-  T bufo[vecloadFactor];
-  T bufdi[vecloadFactor];
-  T bufdo[vecloadFactor];
-  Index idxi, idxo;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
-  outFeatures += blockIdx.y * NumTLP;
-  inFeatures += blockIdx.y * NumTLP;
-  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
-       ix += blockDim.x * gridDim.x * vecloadFactor) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
-      reinterpret_cast<VecType *>(bufo)[0] =
-          reinterpret_cast<const VecType *>(outFeatures)[idxo];
-      reinterpret_cast<VecType *>(bufi)[0] =
-          reinterpret_cast<const VecType *>(inFeatures)[idxi];
-      reinterpret_cast<VecType *>(bufdo)[0] =
-          reinterpret_cast<const VecType *>(dout)[idxo];
-      reinterpret_cast<VecType *>(bufdi)[0] =
-          reinterpret_cast<VecType *>(din)[idxi];
-
-#pragma unroll
-      for (int i = 0; i < vecloadFactor; i++) {
-        if (bufi[i] == bufo[i]) {
-          bufdi[i] += bufdo[i];
-        }
-      }
-      reinterpret_cast<VecType *>(din)[idxi] =
-          reinterpret_cast<VecType *>(bufdi)[0];
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void
-maxPoolBwdGenericKernel(const T *outFeatures, const T *inFeatures,
-                        const T *dout, T *din, const Index *indicesIn,
-                        const Index *indicesOut, int numHot, int numPlanes) {
-  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
-  int ILPStrideX[NumILP];
-  Index RI[NumILP];
-  Index RO[NumILP];
-  T in, out;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < numHot) {
-        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
-        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
-      }
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < numHot) {
-          in = inFeatures[RI[ilp] + iy];
-          out = outFeatures[RO[ilp] + iy];
-          if (in == out) {
-            din[RI[ilp] + iy] += dout[RO[ilp] + iy];
-          }
-        }
-      }
-    }
-  }
-}
-
-using float_types_t = tv::mp_list<float, double, at::Half>;
-using int_types_t = tv::mp_list<int32_t, int64_t>;
-
-void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                      torch::Tensor indicesIn, torch::Tensor indicesOut,
-                      int size) {
-  if (size <= 0)
-    return;
-  int numPlanes = inFeatures.size(1);
-  auto dtype = inFeatures.scalar_type();
-  auto int_dtype = indicesIn.scalar_type();
-  auto stream = at::cuda::getCurrentCUDAStream();
-
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = TV_DECLTYPE(TValue);
-    using vecload_type_t =
-        std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-    using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
-
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
-      using Index = TV_DECLTYPE(IndexValue);
-      bool notFound = true;
-      constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-      tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indicesIn,
-                                       &indicesOut, &notFound](auto NumTLP) {
-        constexpr int NumILP = NumTLP / 4;
-
-        int numHotBlock = (size / NumTLP) * NumTLP;
-        if (notFound) {
-          if (numPlanes % NumTLP == 0) {
-            if (numHotBlock >= NumTLP) {
-              maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                       vecload_type_t>
-                  <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
-                     dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
-                     stream>>>(
-                      outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
-                      indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
-                      numHotBlock, numPlanes / vecloadFactor);
-              TV_CHECK_CUDA_ERR();
-            }
-
-            if (size > numHotBlock) {
-              maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
-                  <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                     0, stream>>>(outFeatures.data_ptr<T>(),
-                                  inFeatures.data_ptr<T>(),
-                                  indicesIn.data_ptr<Index>() + numHotBlock,
-                                  indicesOut.data_ptr<Index>() + numHotBlock,
-                                  size - numHotBlock, numPlanes);
-              TV_CHECK_CUDA_ERR();
-            }
-            notFound = false;
-          }
-        }
-      });
-
-      if (notFound) {
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        int numHotBlock = (size / NumTLP) * NumTLP;
-        if (numHotBlock >= NumTLP) {
-          maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
-              <<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
-                 dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                  outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
-                  indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
-                  numHotBlock, numPlanes);
-          TV_CHECK_CUDA_ERR();
-        }
-
-        if (size > numHotBlock) {
-          maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
-              <<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
-                 dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                  outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
-                  indicesIn.data_ptr<Index>() + numHotBlock,
-                  indicesOut.data_ptr<Index>() + numHotBlock,
-                  size - numHotBlock, numPlanes);
-          TV_CHECK_CUDA_ERR();
-        }
-      }
-    });
-  });
-}
-
-void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
-                      torch::Tensor dout, torch::Tensor din,
-                      torch::Tensor indicesIn, torch::Tensor indicesOut,
-                      int size) {
-  if (size <= 0)
-    return;
-  int numPlanes = inFeatures.size(1);
-  auto dtype = inFeatures.scalar_type();
-  auto int_dtype = indicesIn.scalar_type();
-  auto stream = at::cuda::getCurrentCUDAStream();
-
-  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
-    using T = TV_DECLTYPE(TValue);
-    using vecload_type_t =
-        std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
-    using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
-    tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
-      using Index = TV_DECLTYPE(IndexValue);
-      bool notFound = true;
-      constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-      tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout,
-                                       &din, &indicesIn, &indicesOut,
-                                       &notFound](auto NumTLP) {
-        constexpr int NumILP = NumTLP / 4;
-        int numHotBlock = (size / NumTLP) * NumTLP;
-        if (notFound) {
-          if (numPlanes % NumTLP == 0) {
-            if (numHotBlock >= NumTLP) {
-              maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
-                                       vecload_type_t>
-                  <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
-                     dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
-                     stream>>>(outFeatures.data_ptr<T>(),
-                               inFeatures.data_ptr<T>(), dout.data_ptr<T>(),
-                               din.data_ptr<T>(), indicesIn.data_ptr<Index>(),
-                               indicesOut.data_ptr<Index>(), numHotBlock,
-                               numPlanes / vecloadFactor);
-              TV_CHECK_CUDA_ERR();
-            }
-
-            if (size > numHotBlock) {
-              maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
-                  <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
-                     0, stream>>>(outFeatures.data_ptr<T>(),
-                                  inFeatures.data_ptr<T>(), dout.data_ptr<T>(),
-                                  din.data_ptr<T>(),
-                                  indicesIn.data_ptr<Index>() + numHotBlock,
-                                  indicesOut.data_ptr<Index>() + numHotBlock,
-                                  size - numHotBlock, numPlanes);
-              TV_CHECK_CUDA_ERR();
-            }
-            notFound = false;
-          }
-        }
-      });
-
-      if (notFound) {
-        constexpr int NumTLP = 64;
-        constexpr int NumILP = NumTLP / 4;
-        int numHotBlock = (size / NumTLP) * NumTLP;
-        if (numHotBlock >= NumTLP) {
-          maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
-              <<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
-                 dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                  outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
-                  dout.data_ptr<T>(), din.data_ptr<T>(),
-                  indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
-                  numHotBlock, numPlanes);
-          TV_CHECK_CUDA_ERR();
-        }
-
-        if (size > numHotBlock) {
-          maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
-              <<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
-                 dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
-                  outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
-                  dout.data_ptr<T>(), din.data_ptr<T>(),
-                  indicesIn.data_ptr<Index>() + numHotBlock,
-                  indicesOut.data_ptr<Index>() + numHotBlock,
-                  size - numHotBlock, numPlanes);
-          TV_CHECK_CUDA_ERR();
-        }
-      }
-    });
-  });
-}
-
-} // namespace spconv
\ No newline at end of file
--- a/src/spconv/nms.cc
+++ b/src/spconv/nms.cc
-// Copyright 2019 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <boost/geometry.hpp>
-#include <spconv/nms_functor.h>
-#include <torch/script.h>
-#include <vector>
-
-namespace spconv {
-
-namespace functor {
-template <typename T, typename Index>
-struct NonMaxSupressionFunctor<tv::CPU, T, Index> {
-  Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
-                   tv::TensorView<const T> boxes, T threshold, T eps) {
-    auto ndets = boxes.dim(0);
-    auto suppressed = std::vector<Index>(ndets);
-    auto area = std::vector<T>(ndets);
-    for (int i = 0; i < ndets; ++i) {
-      area[i] =
-          (boxes(i, 2) - boxes(i, 0) + eps) * (boxes(i, 3) - boxes(i, 1) + eps);
-    }
-    int i, j;
-    T xx1, xx2, w, h, inter, ovr;
-    int keepNum = 0;
-    for (int _i = 0; _i < ndets; ++_i) {
-      i = _i;
-      if (suppressed[i] == 1)
-        continue;
-      keep[keepNum] = i;
-      keepNum += 1;
-      for (int _j = _i + 1; _j < ndets; ++_j) {
-        j = _j;
-        if (suppressed[j] == 1)
-          continue;
-        xx2 = std::min(boxes(i, 2), boxes(j, 2));
-        xx1 = std::max(boxes(i, 0), boxes(j, 0));
-        w = xx2 - xx1 + eps;
-        if (w > 0) {
-          xx2 = std::min(boxes(i, 3), boxes(j, 3));
-          xx1 = std::max(boxes(i, 1), boxes(j, 1));
-          h = xx2 - xx1 + eps;
-          if (h > 0) {
-            inter = w * h;
-            ovr = inter / (area[i] + area[j] - inter);
-            if (ovr >= threshold)
-              suppressed[j] = 1;
-          }
-        }
-      }
-    }
-    return keepNum;
-  }
-};
-
-template <typename T, typename Index>
-struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> {
-  Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
-                   tv::TensorView<const T> boxCorners,
-                   tv::TensorView<const T> standupIoU, T threshold) {
-    auto ndets = boxCorners.dim(0);
-    auto suppressed = std::vector<Index>(ndets);
-    int i, j;
-    namespace bg = boost::geometry;
-    typedef bg::model::point<T, 2, bg::cs::cartesian> point_t;
-    typedef bg::model::polygon<point_t> polygon_t;
-    polygon_t poly, qpoly;
-    std::vector<polygon_t> poly_inter, poly_union;
-    T inter_area, union_area, overlap;
-    int keepNum = 0;
-    for (int _i = 0; _i < ndets; ++_i) {
-      i = _i;
-      if (suppressed[i] == 1)
-        continue;
-      keep[keepNum] = i;
-      keepNum += 1;
-      for (int _j = _i + 1; _j < ndets; ++_j) {
-        j = _j;
-        if (suppressed[j] == 1)
-          continue;
-        if (standupIoU(i, j) <= 0.0)
-          continue;
-        bg::append(poly, point_t(boxCorners(i, 0, 0), boxCorners(i, 0, 1)));
-        bg::append(poly, point_t(boxCorners(i, 1, 0), boxCorners(i, 1, 1)));
-        bg::append(poly, point_t(boxCorners(i, 2, 0), boxCorners(i, 2, 1)));
-        bg::append(poly, point_t(boxCorners(i, 3, 0), boxCorners(i, 3, 1)));
-        bg::append(poly, point_t(boxCorners(i, 0, 0), boxCorners(i, 0, 1)));
-        bg::append(qpoly, point_t(boxCorners(j, 0, 0), boxCorners(j, 0, 1)));
-        bg::append(qpoly, point_t(boxCorners(j, 1, 0), boxCorners(j, 1, 1)));
-        bg::append(qpoly, point_t(boxCorners(j, 2, 0), boxCorners(j, 2, 1)));
-        bg::append(qpoly, point_t(boxCorners(j, 3, 0), boxCorners(j, 3, 1)));
-        bg::append(qpoly, point_t(boxCorners(j, 0, 0), boxCorners(j, 0, 1)));
-        bg::intersection(poly, qpoly, poly_inter);
-
-        if (!poly_inter.empty()) {
-          inter_area = bg::area(poly_inter.front());
-          bg::union_(poly, qpoly, poly_union);
-          if (!poly_union.empty()) { // ignore invalid box
-            union_area = bg::area(poly_union.front());
-            overlap = inter_area / union_area;
-            if (overlap >= threshold)
-              suppressed[j] = 1;
-            poly_union.clear();
-          }
-        }
-        poly.clear();
-        qpoly.clear();
-        poly_inter.clear();
-      }
-    }
-    return keepNum;
-  }
-};
-
-} // namespace functor
-
-#define DECLARE_CPU_T_INDEX(T, Index)                                          \
-  template struct functor::NonMaxSupressionFunctor<tv::CPU, T, Index>;         \
-  template struct functor::rotateNonMaxSupressionFunctor<tv::CPU, T, Index>;
-
-#define DECLARE_CPU_INDEX(Index)                                               \
-  DECLARE_CPU_T_INDEX(float, Index);                                           \
-  DECLARE_CPU_T_INDEX(double, Index);
-
-DECLARE_CPU_INDEX(int);
-DECLARE_CPU_INDEX(long);
-
-#undef DECLARE_CPU_INDEX
-#undef DECLARE_CPU_T_INDEX
-
-} // namespace spconv
--- a/src/spconv/nms.cu
+++ b/src/spconv/nms.cu
-// ------------------------------------------------------------------
-// Deformable Convolutional Networks
-// Copyright (c) 2015 Microsoft
-// Licensed under The MIT License
-// Modified from MATLAB Faster R-CNN
-// (https://github.com/shaoqingren/faster_rcnn)
-// ------------------------------------------------------------------
-
-#include <ATen/ATen.h>
-#include <chrono>
-#include <limits>
-#include <spconv/reordering.cu.h>
-#include <spconv/reordering.h>
-#include <tensorview/cuda_utils.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensorview.h>
-#include <type_traits>
-#include <utility/timer.h>
-
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-int const threadsPerBlock = sizeof(unsigned long long) * 8;
-
-template <typename DType>
-__device__ inline DType devIoU(DType const *const a, DType const *const b) {
-  DType left = max(a[0], b[0]), right = min(a[2], b[2]);
-  DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
-  DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
-  DType interS = width * height;
-  DType Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
-  DType Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
-  return interS / (Sa + Sb - interS);
-}
-
-template <typename DType, int BLOCK_THREADS>
-__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
-                           const DType *dev_boxes,
-                           unsigned long long *dev_mask) {
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
-
-  // if (row_start > col_start) return;
-
-  const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
-  const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
-
-  __shared__ DType block_boxes[BLOCK_THREADS * 5];
-  if (threadIdx.x < col_size) {
-#pragma unroll
-    for (int i = 0; i < 5; ++i) {
-      block_boxes[threadIdx.x * 5 + i] =
-          dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
-    }
-  }
-  __syncthreads();
-
-  if (threadIdx.x < row_size) {
-    const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
-    const DType *cur_box = dev_boxes + cur_box_idx * 5;
-    unsigned long long t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = threadIdx.x + 1;
-    }
-    for (int i = start; i < col_size; i++) {
-      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
-        t |= 1ULL << i;
-      }
-    }
-    const int col_blocks = DIVUP(n_boxes, BLOCK_THREADS);
-    dev_mask[cur_box_idx * col_blocks + col_start] = t;
-  }
-}