working on tensor core test

01ed382c · yan.yan · 3517290c · 3517290c · 01ed382c · 01ed382c
Commit 01ed382c authored Oct 18, 2021 by yan.yan
20 changed files
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
-name: build
-on:
-  push:
-    branches:
-      - master
-  schedule:
-    # * is a special character in YAML so you have to quote this string
-    - cron:  '0 0 * * 6' # base builds run every saturday
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    env:
-      DOCKER_IMAGE_NAME: scrin/dev-spconv
-      DOCKER_FILE_PATH: ./Dockerfile
-    # TODO: create a action to reuse code. the problem is how to reuse docker-login.
-    steps:
-      - uses: actions/checkout@master
-      - name: Build Docker
-        run: |
-          docker build . --file ${{env.DOCKER_FILE_PATH}} --tag ${{env.DOCKER_IMAGE_NAME}}:latest
-          docker tag ${{env.DOCKER_IMAGE_NAME}}:latest ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
-      - name: Login to Registry
-        uses: azure/docker-login@v1
-        with:
-          username: ${{ secrets.DOCKER_USERNAME }}
-          password: ${{ secrets.DOCKER_PASSWORD }}
-      - name: Publish to Registry
-        run: |
-          docker push ${{env.DOCKER_IMAGE_NAME}}:latest
-          docker push ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
--- a/.gitignore
+++ b/.gitignore
@@ -107,3 +107,5 @@ venv.bak/
 .mypy_cache/
 .vscode
+__version__.py
\ No newline at end of file
--- a/.gitmodules
+++ b/.gitmodules
-[submodule "third_party/pybind11"]
-	path = third_party/pybind11
-	url = https://github.com/pybind/pybind11.git
-[submodule "third_party/cutlass"]
-	path = third_party/cutlass
-	url = https://github.com/NVIDIA/cutlass
-[submodule "third_party/mp11"]
-	path = third_party/mp11
-	url = https://github.com/boostorg/mp11
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Changelog
+## [2.0.0] - 2021-10-16
+### Changed
+- Change build system from cmake to pccm.
+- Change pytorch python code to spconv.pytorch
+- Rewrite All c++ code.
 ## [1.2.1] - 2020-06-04
 ### Changed
 - The subm indice pair generation speed is greatly increased by two tricks: 1. most subm conv use only kernelsize=3, so we can unroll loops to get 100% performance increase. 2. subm indice pairs have a property: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1], so we can get another 100% performance increase. 

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
-cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
-option(SPCONV_BuildTests "Build the unit tests when BUILD_TESTING is enabled." ON)
-option(SPCONV_BuildCUDA "Build cuda code when BUILD_TESTING is enabled." ON)
-if (SPCONV_BuildCUDA)
-    project(SparseConv LANGUAGES CXX CUDA VERSION 1.1)
-else()
-    project(SparseConv LANGUAGES CXX VERSION 1.1)
-endif()
-if(WIN32) # true if windows (32 and 64 bit)
-    add_compile_definitions(TV_WINDOWS)
-endif()
-add_compile_definitions(PYTORCH_VERSION=${PYTORCH_VERSION})
-set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    add_compile_definitions(TV_DEBUG)
-endif()
-# add_compile_definitions(TV_LOG_KERNEL_INFO)
-find_package(Torch REQUIRED)
-# set(CMAKE_VERBOSE_MAKEFILE ON)
-if (SPCONV_BuildCUDA)
-    set(CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}")
-    get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
-    get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
-    if(WIN32) # true if windows (32 and 64 bit)
-        set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
-    else()
-        set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64")
-    endif()
-    find_library(CUDA_CUDART NAMES cudart HINTS ${CUDA_LIB_PATH_HINTS})
-    find_library(CUDA_CUBLAS NAMES cublas HINTS ${CUDA_LIB_PATH_HINTS})
-    torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA)
-    string (REPLACE ";" " " NVCC_FLAGS_EXTRA_STR "${NVCC_FLAGS_EXTRA}")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA_STR}")
-    add_compile_definitions(TV_CUDA)
-endif()
-# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
-add_subdirectory(third_party/pybind11)
-set(ALL_LIBS ${TORCH_LIBRARIES}) 
-set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include)
-set(MP11_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/mp11/include)
-set(CUTLASS_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/cutlass/include)
-if (SPCONV_BuildCUDA)
-    set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS})
-    set(ALL_INCLUDE ${ALL_INCLUDE} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-    add_subdirectory(src/cuhash)
-    add_subdirectory(src/spgemm)
-endif()
-add_subdirectory(src/spconv)
-add_subdirectory(src/utils)
-if (SPCONV_BuildTests)
-    include(CTest) #adds option BUILD_TESTING (default ON)
-    if(BUILD_TESTING)
-        enable_testing()
-        add_subdirectory(test)
-    endif()
-endif()
--- a/Dockerfile
+++ b/Dockerfile
-FROM scrin/dev:latest
-RUN PROBLEM_FILE=/usr/local/lib/python3.8/dist-packages/torch/share/cmake/Caffe2/Caffe2Targets.cmake && \
-    sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces;-fopenmp//g' $PROBLEM_FILE && \
-    sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces//g' $PROBLEM_FILE && \
-    cd /root && \
-    git clone --depth 1 --recursive https://www.github.com/traveller59/spconv.git && \
-    cd ./spconv && \
-    SPCONV_FORCE_BUILD_CUDA=1 python setup.py install
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
-   Copyright 2019-2020 Yan Yan
+   Copyright 2019-2021 Yan Yan
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.

--- a/PERFORMANCE_GUIDE.md
+++ b/PERFORMANCE_GUIDE.md
-## Performance Guide
-### 1. Regular sparse conv is very slow
-Regular sparse convolution will greatly increase the number of active points. for 3x3x3 3D convolution, we can get at most 27x active points, which means next convolution will perform 27x slower!
-This problem can be solved by using submanifold convolution (SubMConv3d). This kind of sparse convolution doesn't generate new active points.
-**NEVER** use SparseConv3d except downsample data, **NEVER** use SparseConv3dTranspose, use SparseInverseConv3d instead.
-### 2. Large Spatial Shape cost too much GPU memory
-Our implementation use dense map to generate indices in GPU for sparse convolution, which means if your spatial shape is ```[batchSize=4, 1600, 1600, 40]```, it will cost ~2GB GPU memory.
-To solve this problem, you can use CPU algorithm (hash map) for first layer that has large shape, then convert generated indices to GPU and use GPU algorithm for downsampled data.
-Another way is use cuda hash. Unfortunately this library isn't stable enough, it should only be used when the spatial shape is very large.
-### 3. Stacked submanifold convolution can share same indice data
-When you using stacked subm convolution, there is no need to generate indice data again, but this can't be done automatically. you need to specify a unique key ```indice_key="c0"``` and use it for all stacked subm convolution.
-### 4. Different convolution algorithm may lead to different performance
-There are three kind of algorithm: ```Native```, ```Batch```, ```BatchGemmGather```. 
-* ```Native```: should be used for all submanifold convolutions. should be used when there are too much active points.
-* ```Batch```: **cost more GPU memory** should be used when number of active points is small.
-* ```BatchGemmGather```: **cost more GPU memory** can be used for regular convolution.
\ No newline at end of file
--- a/README.md
+++ b/README.md
+<!--
+ Copyright 2021 Yan Yan
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
 # SpConv: PyTorch Spatially Sparse Convolution Library
 [![Build Status](https://github.com/traveller59/spconv/workflows/build/badge.svg)](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild)
-This is a spatially sparse convolution library like [SparseConvNet](https://github.com/facebookresearch/SparseConvNet) but faster and easy to read. This library provide sparse convolution/transposed, submanifold convolution, inverse convolution and sparse maxpool.
+# WORK IN PROGRESS, DON'T USE!!!
-2020-5-2, we add ConcatTable, JoinTable, AddTable, and Identity function to build ResNet and Unet in this version of spconv.
-## Docker:
-```docker pull scrin/dev-spconv```, contains python 3.8, cuda 10.1, fish shell, newest pytorch and tensorflow.
-## Install on Ubuntu 16.04/18.04
-* if you are using pytorch 1.4+ and encounter "nvcc fatal: unknown -Wall", you need to go to torch package dir and remove flags contains "-Wall" in INTERFACE_COMPILE_OPTIONS in Caffe2Targets.cmake. This problem can't be fixed in this project (to avoid this, I need to remove all torch dependency in cuda sources and drop half support).
-0. Use ```git clone xxx.git --recursive``` to clone this repo.
-1. Install boost headers to your system include path, you can use either ```sudo apt-get install libboost-all-dev``` or download compressed files from boost official website and copy headers to include path.
-2. Download cmake >= 3.13.2, then add cmake executables to PATH.
-3. Ensure you have installed pytorch 1.0+ in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```).
-4. Run ```cd ./dist```, use pip to install generated whl file.
-## Install on Windows 10 with CUDA 10 and python 3.6 (python 3.7 may have problem, see [this](https://github.com/pytorch/pytorch/issues/17233))
-Since install newest driver and CUDA is very simple on windows, please use CUDA 10 on windows. 
-0. Install Visual Studio 2017. Use ```git clone xxx.git --recursive``` to clone this repo.
-1. Download compressed files from boost official website and copy headers (i.e. boost_1_69/boost) to spconv/include.
-2. Download and install cmake >= 3.13.2, select add cmake to User or System PATH.
-3. Ensure you have installed pytorch 1.0 in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```).
-4. Run ```cd ./dist```, use pip to install generated whl file.
-## Compare with SparseConvNet
-### Features
-* SparseConvNet's Sparse Convolution don't support padding and dilation, spconv support this.
-* spconv only contains sparse convolutions, the batchnorm and activations can directly use layers from torch.nn, SparseConvNet contains lots of their own implementation of layers such as batchnorm and activations.
-### Speed
-* spconv is faster than SparseConvNet due to gpu indice generation and gather-gemm-scatter algorithm. SparseConvNet use hand-written gemm which is slow.
-## Usage
-### SparseConvTensor
-```Python
-features = # your features with shape [N, numPlanes]
-indices = # your indices/coordinates with shape [N, ndim + 1], batch index must be put in indices[:, 0]
-spatial_shape = # spatial shape of your sparse tensor, spatial_shape[i] is shape of indices[:, 1 + i].
-batch_size = # batch size of your sparse tensor.
-x = spconv.SparseConvTensor(features, indices, spatial_shape, batch_size)
-x_dense_NCHW = x.dense() # convert sparse tensor to dense NCHW tensor.
-print(x.sparity) # helper function to check sparity. 
-```
-### Sparse Convolution
-```Python
-import spconv
-from torch import nn
-class ExampleNet(nn.Module):
-    def __init__(self, shape):
-        super().__init__()
-        self.net = spconv.SparseSequential(
-            spconv.SparseConv3d(32, 64, 3), # just like nn.Conv3d but don't support group and all([d > 1, s > 1])
-            nn.BatchNorm1d(64), # non-spatial layers can be used directly in SparseSequential.
-            nn.ReLU(),
-            spconv.SubMConv3d(64, 64, 3, indice_key="subm0"),
-            nn.BatchNorm1d(64),
-            nn.ReLU(),
-            # when use submanifold convolutions, their indices can be shared to save indices generation time.
-            spconv.SubMConv3d(64, 64, 3, indice_key="subm0"),
-            nn.BatchNorm1d(64),
-            nn.ReLU(),
-            spconv.SparseConvTranspose3d(64, 64, 3, 2),
-            nn.BatchNorm1d(64),
-            nn.ReLU(),
-            spconv.ToDense(), # convert spconv tensor to dense and convert it to NCHW format.
-            nn.Conv3d(64, 64, 3),
-            nn.BatchNorm1d(64),
-            nn.ReLU(),
-        )
-        self.shape = shape
-    def forward(self, features, coors, batch_size):
-        coors = coors.int() # unlike torch, this library only accept int coordinates.
-        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
-        return self.net(x)# .dense()
-```
-### Inverse Convolution
+## Breaking changes in Spconv 2.x
-Inverse sparse convolution means "inv" of sparse convolution. the output of inverse convolution contains same indices as input of sparse convolution.
+* ```spconv.xxx``` has moved to ```spconv.pytorch.xxx```: change all ```import spconv``` to ```import spconv.pytorch as spconv``` and ```from spconv.xxx import``` to ```from spconv.pytorch.xxx import```.
+* ```use_hash``` in Sparse Convolution is removed, we only use hash table in 2.x.
+* weight layout has been changed to RSKC (native algorithm) or KRSC (implicit gemm), no longer RSCK (spconv 1.x). RS is kernel size, C is input channel, K is output channel.
+* all util ops are removed (pillar scatter/nms/...)
+* VoxelGenerator has been replaced by Point2VoxelGPU[1-4]d/Point2VoxelCPU[1-4]d.
+* spconv 2.x doesn't support CPU for now.
-Inverse convolution usually used in semantic segmentation.
+## News in Spconv 2.0.0
-```Python
+* training/inference speed is increased
-class ExampleNet(nn.Module):
+* support int8/tensor core
-    def __init__(self, shape):
+* doesn't depend on pytorch binary. 
-        super().__init__()
+* If your GPU has tensor core, try mixed precision training in spconv 2.x!
-        self.net = spconv.SparseSequential(
+* since spconv 2.x doesn't depend on pytorch binary (and never will), it's impossible to support torch.jit/libtorch inference.
-            spconv.SparseConv3d(32, 64, 3, 2, indice_key="cp0"),
-            spconv.SparseInverseConv3d(64, 32, 3, indice_key="cp0"), # need provide kernel size to create weight
-        )
-        self.shape = shape
-    def forward(self, features, coors, batch_size):
+## TODO in Spconv 2.x
-        coors = coors.int()
+- [ ] Ampere (A100 / RTX 3000 series) feature support (work in progress)
-        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
+- [ ] torch QAT support (work in progress)
-        return self.net(x)
+- [ ] TensorRT (torch.fx based)
-```
+- [ ] Build C++ only package
+- [ ] JIT compilation for CUDA kernels
+- [ ] Document (low priority)
+- [ ] CPU support (low priority)
-### Utility functions
+## Install
-* convert point cloud to voxel
+You need to install python >= 3.6 first to use spconv 2.x.
-```Python
+You need to install the CUDA toolkit before using prebuilt binaries or building from source.
-voxel_generator = spconv.utils.VoxelGenerator(
+You need at least CUDA 10.2 to build and run spconv 2.x. We won't offer any support for CUDA < 10.2.
-    voxel_size=[0.1, 0.1, 0.1], 
-    point_cloud_range=[-50, -50, -3, 50, 50, 1],
-    max_num_points=30,
-    max_voxels=40000
-)
-points = # [N, 3+] tensor.
+### Prebuilt
-voxels, coords, num_points_per_voxel = voxel_generator.generate(points)
-```
-## Implementation Details
+We offer python 3.6-3.10 and cuda 10.2/11.1/11.4 prebuilt binaries for linux (manylinux) and windows 10/11.
-This implementation use gather-gemm-scatter framework to do sparse convolution.
+We will offer prebuilts for CUDA versions supported by the latest pytorch release. For example, pytorch 1.9 supports cuda 10.2 and 11.1, so we support them too.
-## Projects using spconv:
+For Linux users, you need to install pip >= 20.3 first to install prebuilt.
-* [second.pytorch](https://github.com/traveller59/second.pytorch): Point Cloud Object Detection in KITTI Dataset.
+```pip install spconv-cu102``` for CUDA 10.2
-## Authors
+```pip install spconv-cu111``` for CUDA 11.1
-* **Yan Yan** - *Initial work* - [traveller59](https://github.com/traveller59)
+```pip install spconv-cu114``` for CUDA 11.4
-* **Bo Li** - *gpu indice generation idea, owner of patent of the sparse conv gpu indice generation algorithm (don't include subm)* - [prclibo](https://github.com/prclibo)
+### Build from source
-## Third party libraries
+You need to rebuild ```cumm``` first if you are building against a CUDA version that is not provided in prebuilts.
-* [CUDPP](https://github.com/cudpp/cudpp): A cuda library. contains a cuda hash implementation.
+#### Linux
-* [robin-map](https://github.com/Tessil/robin-map): A fast c++ hash library. almost 2x faster than std::unordered_map in this project.
+1. install build-essential, install CUDA
+2. run ```export SPCONV_DISABLE_JIT="1"```
+3. run ```python setup.py install```/```pip install -e .```/```python setup.py bdist_wheel```+```pip install dist/xxx.whl```
-* [pybind11](https://github.com/pybind/pybind11): A head-only python c++ binding library.
+#### Windows 10/11
-* [prettyprint](https://github.com/louisdx/cxx-prettyprint): A head-only library for container print.
+1. install visual studio 2019 or newer. make sure C++ development package is installed. install CUDA
+2. set [powershell script execution policy](https://docs.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_execution_policies?view=powershell-7.1)
+3. start a new powershell, run ```tools/msvc_setup.ps1```
+4. run ```$Env:SPCONV_DISABLE_JIT = "1"```
+5. run ```python setup.py install```/```pip install -e .```/```python setup.py bdist_wheel```+```pip install dist/xxx.whl```
-## License
+## Note
-This project is licensed under the Apache license 2.0 License - see the [LICENSE.md](LICENSE.md) file for details
+This work was done while the author was an employee at Tusimple.
-The [CUDPP](https://github.com/cudpp/cudpp) hash code is licensed under BSD License.
+## LICENSE
-The [robin-map](https://github.com/Tessil/robin-map) code is licensed under MIT license.
+Apache 2.0
\ No newline at end of file
--- a/codeai-devops.yaml
+++ b/codeai-devops.yaml
-global:
-    console_url: localhost:50091
-    envs: 
-        PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python # c++ libprotobuf and python will conflicit
-analyzers: # only one analyzer is allowed for one type for now.
-    PythonAnalyzer:
-    SimpleCPPAnalyzer: # $<astex> devops.devs = ["_ci_dev_xxx"] </astex> is allowed in raw sources.
-        includes: ["*.cpp", "*.cu", "*.cc", "*.h", "*.hpp", "*.hxx", "*.cxx"]
-observers:
-    # run test functions when that function change or marked function change.
-    test:
-        type: TestObserver
-    # run dev functions when that function change or marked function change.
-    dev:
-        type: DevObserver
-        pattern: _ci_dev_.*
-    clangdev:
-        type: CPPDevObserver
-        main_pattern: dev_.*\.(cc|cpp|cxx)
-        pattern: .*\.(cc|cpp|cxx|h|hpp|hxx)
-        compiler: clang++
-        executable: build/codeai_dev
-        includes: [
-            include,
-            /usr/local/cuda/include,
-            /home/yy/anaconda3/include,
-            /home/yy/anaconda3/include/python3.7m,
-            third_party/pybind11/include,
-            third_party/include,
-            /home/yy/library/boost_1_72_0,
-        ]
-        libpaths: [
-            /home/yy/anaconda3/lib,
-        ]
-        libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg]
-        std: c++14
-        options: [-Wall, -Wextra]
-    cudadev:
-        type: CPPDevObserver
-        main_pattern: dev_.*\.cu
-        pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
-        compiler: nvcc
-        executable: build/codeai_dev_cuda
-        run_cmd: [$(executable)]
-        sources: []
-        includes: [
-            include,
-            /usr/local/cuda/include,
-            /home/yy/anaconda3/include,
-            /home/yy/anaconda3/include/python3.7m,
-            third_party/pybind11/include,
-            third_party/cutlass/include,
-        ]
-        libpaths: [
-            /usr/local/cuda/lib64,
-            /home/yy/anaconda3/lib,
-        ]
-        libraries: [-lpython3.7m, -lcudart, -lcublas, -ljpeg]
-        std: c++14
-        options: [
-            -Wno-deprecated-declarations,
-            # "-gencode=arch=compute_52,code=sm_52",
-            "-gencode=arch=compute_61,code=sm_61",
-            # "-gencode=arch=compute_60,code=sm_60",
-            # "-gencode=arch=compute_70,code=sm_70",
-            # "-gencode=arch=compute_75,code=sm_75",
-        ]
-    torchdev:
-        type: CPPDevObserver
-        main_pattern: torchdev_.*\.(cu|cpp|cc|cxx)
-        pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
-        compiler: nvcc
-        executable: build/codeai_dev_torch
-        run_cmd: [$(executable)]
-        fail_cmds: # run cmd when pervious run fail with retcode
-            -6: [gdb, -ex, run, -ex, bt, -ex, quit, $(executable)] # segfault in unix
-        includes: [
-            include,
-            /home/yy/anaconda3/lib/python3.7/site-packages/torch/include,
-            /home/yy/anaconda3/lib/python3.7/site-packages/torch/include/torch/csrc/api/include,
-            /usr/local/cuda/include,
-            /home/yy/anaconda3/include,
-            /home/yy/anaconda3/include/python3.7m,
-            third_party/pybind11/include,
-            third_party/cutlass/include,
-        ]
-        libpaths: [
-            /home/yy/anaconda3/lib/python3.7/site-packages/torch/lib,
-            /usr/local/cuda/lib64,
-            /home/yy/anaconda3/lib,
-        ]
-        libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg, -lpthread, 
-                    "-Xcompiler=\"-Wl,--no-as-needed,-lc10\"", 
-                    "-Xcompiler=\"-Wl,--no-as-needed,-ltorch\"", 
-                    "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cpu\"", 
-                    "-Xcompiler=\"-Wl,--no-as-needed,-lc10_cuda\"", 
-                    "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cuda\""]
-        std: c++14
-        # options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0]
-        options: [
-            -Wno-deprecated-declarations,
-            --expt-relaxed-constexpr,
-            "-gencode=arch=compute_61,code=sm_61",
-            -D_GLIBCXX_USE_CXX11_ABI=0,
-        ]
--- a/docs/API.md
+++ b/docs/API.md
+<!--
+ Copyright 2021 Yan Yan
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
--- a/docs/DEVELOPMENT.md
+++ b/docs/DEVELOPMENT.md
+<!--
+ Copyright 2021 Yan Yan
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
--- a/docs/PERFORMANCE_GUIDE.md
+++ b/docs/PERFORMANCE_GUIDE.md
+<!--
+ Copyright 2021 Yan Yan
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
--- a/example/mnist_sparse.py
+++ b/example/mnist_sparse.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from __future__ import print_function
 import argparse
 import torch
-import spconv
+import spconv.pytorch as spconv
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim

--- a/example/voxel_gen.py
+++ b/example/voxel_gen.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np 
+from cumm import tensorview as tv 
+from spconv.utils import Point2VoxelCPU3d
+def main():
+    gen = Point2VoxelCPU3d(
+        vsize_xyz=[0.1, 0.1, 0.1], 
+        coors_range_xyz=[-80, -80, -2, 80, 80, 6], 
+        num_point_features=3, 
+        max_num_voxels=5000, 
+        max_num_points_per_voxel=5)
+    pc = np.random.uniform(-10, 10, size=[1000, 3])
+    pc_tv = tv.from_numpy(pc)
+    # generate voxels; note that voxels_tv references a persistent buffer in the generator,
+    # so we can't run it from multiple threads.
+    voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel(pc_tv)
+    # run voxel gen and FILL MEAN VALUE to voxel remain
+    voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel_empty_mean(pc_tv)
+if __name__ == "__main__":
+    main()
--- a/include/cuhash/cuda_util.h
+++ b/include/cuhash/cuda_util.h
-#ifndef _CUDA_UTIL_H_
-#define _CUDA_UTIL_H_
-#if CUDART_VERSION >= 4000
-#define CUDA_DEVICE_SYNCHRONIZE() cudaDeviceSynchronize();
-#else
-#define CUDA_DEVICE_SYNCHRONIZE() cudaThreadSynchronize();
-#endif
-#define CUDA_SAFE_CALL_NO_SYNC(call)                                           \
-  {                                                                            \
-    cudaError err = call;                                                      \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__,  \
-              __LINE__, cudaGetErrorString(err));                              \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }
-#define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call);
-//! Check for CUDA error
-#ifdef _DEBUG
-#define CUDA_CHECK_ERROR(errorMessage)                                         \
-  {                                                                            \
-    cudaError_t err = cudaGetLastError();                                      \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
-              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-    err = CUDA_DEVICE_SYNCHRONIZE();                                           \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
-              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }
-#else
-#define CUDA_CHECK_ERROR(errorMessage)                                         \
-  {                                                                            \
-    cudaError_t err = cudaGetLastError();                                      \
-    if (cudaSuccess != err) {                                                  \
-      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
-              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
-      exit(EXIT_FAILURE);                                                      \
-    }                                                                          \
-  }
-#endif
-#endif
\ No newline at end of file
--- a/include/cuhash/debugging.h
+++ b/include/cuhash/debugging.h
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-/**
- * @file
- * debugging.h
- *
- * @brief Debugging/statistics/performance utilities header for hash tables.
- */
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H
-#define CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H
-#include "definitions.h"
-#include <cuda_runtime_api.h>
-#include <vector_types.h>
-#include <algorithm>
-namespace cuhash {
-//! @name Debugging functions
-/// @{
-//! Declaration of the hash-function statistics routine (defined elsewhere).
-/*! NOTE(review): the parameter descriptions below are inferred from the
- *  parameter names only -- confirm against the implementation.
- *  @param[in]  num_keys           Presumably the number of input keys.
- *  @param[in]  d_keys             Presumably a device array of the input keys.
- *  @param[in]  table_size         Presumably the size of the hash table.
- *  @param[in]  constants          Presumably one uint2 of constants per hash
- * function.
- *  @param[in]  kNumHashFunctions  Presumably the number of hash functions.
- */
-void TakeHashFunctionStatistics(const unsigned num_keys, const unsigned *d_keys,
-                                const unsigned table_size,
-                                const uint2 *constants,
-                                const unsigned kNumHashFunctions);
-//! Output how many probes were required by each thread to perform the
-//! retrieval.
-/*! @param[in]  n_queries           Number of queries being performed.
- *  @param[in]  d_retrieval_probes  Device array: the number of probes taken for
- * each thread's retrieval.
- *  @param[in]  n_functions         Number of hash functions used.
- */
-void OutputRetrievalStatistics(const unsigned n_queries,
-                               const unsigned *d_retrieval_probes,
-                               const unsigned n_functions);
-//! Outputs information about how many iterations threads required to
-//! successfully cuckoo hash.
-/*! @param[in]  n                   Number of keys in the input.
- *  @param[in]  d_iterations_taken  Device mem: Number of iterations each
- * thread took.
- */
-void OutputBuildStatistics(const unsigned n,
-                           const unsigned *d_iterations_taken);
-//! Prints out the contents of the stash.
-/*! @param[in]  d_stash  Stash entries to print (device memory, judging by the
- * d_ prefix -- confirm).
- */
-void PrintStashContents(const Entry *d_stash);
-//! Checks if a key is assigned the same slot by different hash functions.
-/*! NOTE(review): the parameter descriptions below are inferred from the
- *  parameter names only -- confirm against the implementation.
- *  @param[in]  N           Presumably the number of hash functions.
- *  @param[in]  num_keys    Presumably the number of input keys.
- *  @param[in]  d_keys      Presumably a device array of the input keys.
- *  @param[in]  table_size  Presumably the size of the hash table.
- *  @param[in]  constants   Presumably one uint2 of constants per hash function.
- *  @returns A bool; which outcome maps to true is not visible from this
- * declaration.
- */
-bool CheckAssignedSameSlot(const unsigned N, const unsigned num_keys,
-                           const unsigned *d_keys, const unsigned table_size,
-                           uint2 *constants);
-/// @}
-} // namespace cuhash
-#endif
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
--- a/include/cuhash/definitions.h
+++ b/include/cuhash/definitions.h
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-/**
- * @file definitions.h
- *
- * @brief Stores configuration flags and definitions for hard-coded values in
- * hash table implementations.
- */
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H
-#define CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H
-#include <cstdio>
-#include <limits>
-#include <tensorview/tensorview.h>
-/* --------------------------------------------------------------------------
-   Debugging.
-   Every flag in this section is defined only when _DEBUG is defined.
-   -------------------------------------------------------------------------- */
-#ifdef _DEBUG
-//! Forces the hash functions to generate a full set of slots for each key when
-//! not using subtables.  Currently disabled: the define below is commented out.
-// #define FORCEFULLY_GENERATE_NO_CYCLES
-//! Count how many iterations are taken to insert/find items.
-#define TRACK_ITERATIONS
-//! Count how many items fail to be inserted when the hash table fails to build.
-#define COUNT_UNINSERTED
-//! Take some statistics on the hash functions.
-#define TAKE_HASH_FUNCTION_STATISTICS
-#ifdef TAKE_HASH_FUNCTION_STATISTICS
-//! Determine how many keys hash into each table slot.
-#define COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
-//! Determine how many unique slots a key is assigned.
-#define COUNT_HOW_MANY_HAVE_CYCLES
-#endif
-#endif
-#ifdef USE_DAN_OUTPUT
-#include <Utilities/output.h>
-//! Hands the message and its error flag to PrintIndentedMessage().
-inline void PrintMessage(const char *message, const bool error = false) {
-  PrintIndentedMessage(message, error);
-}
-#else
-//! Prints a message to the console, one per line.
-/*! Error messages are prefixed with "cudahash: "; other messages are printed
- *  as given.
- */
-inline void PrintMessage(const char *message, const bool error = false) {
-  const char *prefix = error ? "cudahash: " : "";
-  printf("%s%s\n", prefix, message);
-}
-#endif
-/* -------------------------------------------------------------------------
-   Hash table constants and definitions.
-   ------------------------------------------------------------------------- */
-namespace cuhash {
-/**
- * \addtogroup cudpp_hash_data_structures
- *
- * @{
- */
-//! A key and its value are stored in a 64-bit number.  The key is stored in
-//! the upper 32 bits.
-typedef unsigned long long Entry;
-//! Number of build attempts.
-const unsigned kMaxRestartAttempts = 10;
-//! Signifies empty slots in the table.
-const unsigned kKeyEmpty = 0xffffffffu;
-//! Signifies that a query key was not found.  Same bit pattern as kKeyEmpty.
-const unsigned kNotFound = 0xffffffffu;
-//! Maximum number of hash functions allowed.
-const unsigned kMaxHashFunctions = 5;
-//! How many slots the stash hash table contains.
-const unsigned kStashSize = 101;
-//! Value indicating that a hash table slot has no valid item within it: the
-//! empty key in the upper 32 bits and zero in the lower 32 bits.
-const Entry kEntryEmpty = Entry(kKeyEmpty) << 32;
-//! Value returned when a query fails: the empty key in the upper 32 bits and
-//! kNotFound in the lower 32 bits.
-const Entry kEntryNotFound = (Entry(kKeyEmpty) << 32) + kNotFound;
-//! Number of threads to put in a thread block.
-const unsigned kBlockSize = 64;
-//! Number of blocks to put along each axis of the grid.
-const unsigned kGridSize = 16384;
-//! Minimum table sizes for 2 through 5 functions.  The first two entries are
-//! float-max placeholders; presumably the array is indexed by the number of
-//! hash functions -- confirm against the code that reads it.
-const float kMinimumSpaceUsages[] = {std::numeric_limits<float>::max(),
-                                     std::numeric_limits<float>::max(),
-                                     2.01f,
-                                     1.1f,
-                                     1.03f,
-                                     1.02f};
-/** @} */ // end cudpp_hash_data_structures
-} // namespace cuhash
-#endif
--- a/include/cuhash/hash_functions.h
+++ b/include/cuhash/hash_functions.h
-/*! @file hash_functions.h
- *  @brief Hash function code.
- */
-#ifndef HASH_FUNCTIONS__H
-#define HASH_FUNCTIONS__H
-#include "definitions.h"
-#include <tensorview/tensorview.h>
-#include <vector_types.h>
-namespace cuhash {
-//! Prime number (2^32 - 5) larger than the largest practical hash table size.
-const unsigned kPrimeDivisor = 4294967291u;
-// https://www.alpertron.com.ar/ECM.HTM
-// const unsigned long kPrimeDivisor = 18446744073709551557lu
-// const long kPrimeDivisor = 9223372036854775783l
-// const Entry kPrimeDivisor = 4300000013lu;
-// const unsigned kPrimeDivisor = 334214459;
-//! Generates a set of linear hash function constants.
-/*! Declaration only; the definition is not part of this header.
-    Parameters are listed in signature order.
-    @param[in]  N           Number of hash functions.
-    @param[in]  num_keys    Debug only: How many keys are in the input.
-    @param[in]  d_keys      Debug only: Device memory array containing the input
-   keys.
-    @param[in]  table_size  Debug only: Size of the hash table.
-    @param[out] constants   CPU pointer to the constants.
- */
-void GenerateFunctions(const unsigned N, const unsigned num_keys,
-                       const unsigned *d_keys, const unsigned table_size,
-                       uint2 *constants);
-//! Holds the constants of N hash functions.
-template <unsigned N>
-struct Functions {
-  //! Constants for all of the hash functions, including the stash.  Each
-  //! function uses one uint2, i.e. two constants.
-  uint2 constants[N];
-
-  //! Generate new hash function constants by calling GenerateFunctions().
-  /*! The parameters are only used for debugging and examining the key
-      distribution.
-      \param[in] num_keys   Debug: Number of keys in the input.
-      \param[in] d_keys     Debug: Device array of the input keys.
-      \param[in] table_size Debug: Size of the hash table.
-  */
-  void Generate(const unsigned num_keys, const unsigned *d_keys,
-                const unsigned table_size) {
-    GenerateFunctions(N, num_keys, d_keys, table_size, this->constants);
-  }
-};
-//! Evaluates one hash function for a key.
-/*! \param[in] constants  Constants used by the hash function.
-  ! \param[in] key        Key being hashed.
-  ! \returns              The value of the hash function for the key.
- */
-inline __device__ __host__ unsigned hash_function_inner(const uint2 constants,
-                                                        const unsigned key) {
-#if 1
-  // Fast version: XOR with x, add y, then reduce modulo the prime.
-  const unsigned mixed = (constants.x ^ key) + constants.y;
-  return mixed % kPrimeDivisor;
-#else
-  // Slow version (compiled out): 64-bit multiply-add before the reduction.
-  const unsigned long long scaled = (unsigned long long)constants.x * key;
-  return (scaled + constants.y) % kPrimeDivisor;
-#endif
-}
-//! Evaluates the selected hash function for a key.
-/*! \param[in] functions        All of the constants used by the hash functions.
-  ! \param[in] which_function   Index of the hash function to use.
-  ! \param[in] key              Key being hashed.
-  ! \returns                    hash_function_inner() applied to the selected
-  !                             constants and the key.
- */
-template <unsigned kNumHashFunctions>
-TV_HOST_DEVICE_INLINE unsigned
-hash_function(const Functions<kNumHashFunctions> functions,
-              const unsigned which_function, const unsigned key) {
-  const uint2 selected = functions.constants[which_function];
-  return hash_function_inner(selected, key);
-}
-//! Simple hash function used by the stash.
-/*! Computes (x ^ (key + y)) % kStashSize.  In C++ '+' binds tighter than '^',
- *  so the sum is formed first; the parentheses make that grouping explicit.
- *  This differs from hash_function_inner(), which XORs first and adds
- *  afterwards.
- *  \param[in] stash_constants  The two constants (x, y) of the stash function.
- *  \param[in] key              Key being hashed.
- *  \returns                    A slot index in [0, kStashSize).
- */
-TV_HOST_DEVICE_INLINE
-unsigned stash_hash_function(const uint2 stash_constants, const unsigned key) {
-  return (stash_constants.x ^ (key + stash_constants.y)) % kStashSize;
-}
-unsigned generate_random_uint32(); //!< Declaration only; no definition appears in this header.
-}; // namespace cuhash
-#endif
--- a/include/cuhash/hash_table.cuh
+++ b/include/cuhash/hash_table.cuh
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-/**
- * @file hash_table.cuh
- *
- * @brief Implements kernel and __device__ functions for a basic hash table.
- */
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH
-#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH
-#include "definitions.h"
-#include "hash_table.h"
-#include <driver_types.h>
-#include <tensorview/tensorview.h>
-namespace cuhash {
-//! Packs a key-value pair into a 64-bit Entry: the key goes into the upper 32
-//! bits and the value is added into the lower 32 bits.
-TV_HOST_DEVICE_INLINE Entry make_entry(unsigned key, unsigned value) {
-  const Entry upper = static_cast<Entry>(key) << 32;
-  return upper + value;
-}
-//! Extracts the key (upper 32 bits) from an Entry.
-TV_HOST_DEVICE_INLINE unsigned get_key(Entry entry) {
-  const Entry upper = entry >> 32;
-  return static_cast<unsigned>(upper);
-}
-//! Extracts the value (lower 32 bits) from an Entry.
-TV_HOST_DEVICE_INLINE unsigned get_value(Entry entry) {
-  const Entry lower = entry & 0xffffffff;
-  return static_cast<unsigned>(lower);
-}
-//! @name Internal
-//! @brief Functions used for building the hash table.
-//! @{
-//! Kernel: writes `value` into table[i] for every i below table_size.
-/*! Each thread handles the single slot given by its flattened index; threads
- *  whose index is at or beyond table_size do nothing.
- */
-template <class T>
-__global__ void clear_table(const unsigned table_size, const T value,
-                            T *table) {
-  const unsigned slot = threadIdx.x + blockIdx.x * blockDim.x +
-                        blockIdx.y * blockDim.x * gridDim.x;
-  if (slot >= table_size) {
-    return;
-  }
-  table[slot] = value;
-}
-//! Determine where in the hash table the key could be located.
-/*! \param[in]  constants   Constants of all hash functions.
- *  \param[in]  table_size  Size of the hash table; every location is reduced
- *                          modulo this value.
- *  \param[in]  key         Key being located.
- *  \param[out] locations   Receives one slot index per hash function.
- */
-template <unsigned kNumHashFunctions>
-__device__ void KeyLocations(const Functions<kNumHashFunctions> constants,
-                             const unsigned table_size, const unsigned key,
-                             unsigned locations[kNumHashFunctions]) {
-// Compute all possible locations for the key in the big table.
-// The index is unsigned to match the unsigned template parameter it is
-// compared against.
-#pragma unroll
-  for (unsigned i = 0; i < kNumHashFunctions; ++i) {
-    locations[i] = hash_function(constants, i, key) % table_size;
-  }
-}
-//! @}
-/* --------------------------------------------------------------------------
-   Retrieval functions.
-   -------------------------------------------------------------------------- */
-//! Answers a single query.
-/*! @ingroup PublicInterface
- *  @param[in]  query_key             Query key
- *  @param[in]  table_size            Size of the hash table
- *  @param[in]  table                 The contents of the hash table; the stash
- * is read from the slots starting at table + table_size
- *  @param[in]  constants             The hash functions used to build the table
- *  @param[in]  stash_constants       The hash function used to build the stash
- *  @param[in]  stash_count           The number of items in the stash; the
- * stash is only probed when this is non-zero
- *  @param[out] num_probes_required   Debug only: The number of probes required
- * to resolve the query.  Only written when TRACK_ITERATIONS is defined and the
- * pointer is non-NULL.
- *  @returns The value of the query key, if the key exists in the table.
- * Otherwise, \ref kNotFound will be returned.
- */
-template <unsigned kNumHashFunctions>
-__device__ unsigned
-retrieve(const unsigned query_key, const unsigned table_size,
-         const Entry *table, const Functions<kNumHashFunctions> constants,
-         const uint2 stash_constants, const unsigned stash_count,
-         unsigned *num_probes_required = NULL) {
-  // Identify all of the locations that the key can be located in.
-  unsigned locations[kNumHashFunctions];
-  KeyLocations(constants, table_size, query_key, locations);
-  /* Check each location in hash-function order.  Probing continues only while
-   * the slot just read holds a key that is neither the query key nor
-   * kNotFound. */
-  unsigned num_probes = 1;
-  Entry entry = table[locations[0]];
-  unsigned key = get_key(entry);
-#pragma unroll
-  for (unsigned i = 1; i < kNumHashFunctions; ++i) {
-    if (key != query_key && key != kNotFound) {
-      num_probes++;
-      entry = table[locations[i]];
-      key = get_key(entry);
-    }
-  }
-  // Check the stash: only when it is non-empty and the key is still not found.
-  if (stash_count && get_key(entry) != query_key) {
-    num_probes++;
-    const Entry *stash = table + table_size;
-    unsigned slot = stash_hash_function(stash_constants, query_key);
-    entry = stash[slot];
-  }
-#ifdef TRACK_ITERATIONS
-  if (num_probes_required) {
-    *num_probes_required = num_probes;
-  }
-#endif
-  if (get_key(entry) == query_key) {
-    return get_value(entry);
-  } else {
-    return kNotFound;
-  }
-}
-//! Kernel: answers one query per thread against a basic hash table.
-/*! Thread i looks up keys_in[i] with retrieve() and stores the result in
- *  values_out[i]; threads whose index is at or beyond n_queries do nothing.
- *  When num_probes_required is non-NULL, retrieve() is given the address of
- *  element i of that array; otherwise it is given NULL.
- */
-template <unsigned kNumHashFunctions>
-__global__ void hash_retrieve(const unsigned n_queries, const unsigned *keys_in,
-                              const unsigned table_size, const Entry *table,
-                              const Functions<kNumHashFunctions> constants,
-                              const uint2 stash_constants,
-                              const unsigned stash_count, unsigned *values_out,
-                              unsigned *num_probes_required = NULL) {
-  const unsigned query_index = threadIdx.x + blockIdx.x * blockDim.x +
-                               blockIdx.y * blockDim.x * gridDim.x;
-  if (query_index < n_queries) {
-    const unsigned query_key = keys_in[query_index];
-    unsigned *probe_slot = NULL;
-    if (num_probes_required) {
-      probe_slot = num_probes_required + query_index;
-    }
-    values_out[query_index] = retrieve<kNumHashFunctions>(
-        query_key, table_size, table, constants, stash_constants, stash_count,
-        probe_slot);
-  }
-}
-/* --------------------------------------------------------------------------
-   Build a cuckoo hash table.
-   -------------------------------------------------------------------------- */
-//! @name Internal
-//! @{
-//! Determine where to insert the key next.  The hash functions are used in
-//! round-robin order: the slot after previous_location, wrapping to the first.
-template <unsigned kNumHashFunctions>
-__device__ unsigned
-determine_next_location(const Functions<kNumHashFunctions> constants,
-                        const unsigned table_size, const unsigned key,
-                        const unsigned previous_location) {
-  // Identify all possible locations for the entry (one per hash function).
-  unsigned locations[kNumHashFunctions];
-#pragma unroll
-  for (unsigned i = 0; i < kNumHashFunctions; ++i) {
-    locations[i] = hash_function(constants, i, key) % table_size;
-  }
-  /* Figure out where the item should be inserted next.  Start from the first
-   * location; whenever previous_location equals locations[i], switch to
-   * locations[i + 1].  The loop runs from high i to low i, so the lowest
-   * matching i decides the result.  The last location is never compared, so a
-   * key coming from it (or from no known location) gets locations[0]. */
-  unsigned next_location = locations[0];
-#pragma unroll
-  for (int i = kNumHashFunctions - 2; i >= 0; --i) {
-    next_location =
-        (previous_location == locations[i] ? locations[i + 1] : next_location);
-  }
-  return next_location;
-}
-//! Attempts to insert a single entry into the hash table.
-/*! This process stops after a certain number of iterations.  If the thread is
-    still holding onto an item because of an eviction, it tries the stash.
-    If it fails to enter the stash, it returns false.
-    Otherwise, it succeeds and returns true.
-    \param[in]     table_size              Size of the hash table; the stash
-                                           starts at table + table_size.
-    \param[in]     constants               Constants of the hash functions.
-    \param[in]     stash_constants         Constants of the stash hash function.
-    \param[in]     max_iteration_attempts  Upper bound on loop iterations.
-    \param[in,out] table                   Hash table followed by the stash.
-    \param[in,out] stash_count             Incremented atomically when the
-                                           stash slot is claimed.
-    \param[in]     entry                   Entry to insert.
-    \param[out]    iterations_used         Written only when an empty slot is
-                                           found inside the loop; left untouched
-                                           when the stash is tried.
- */
-template <unsigned kNumHashFunctions>
-__device__ bool
-insert(const unsigned table_size, const Functions<kNumHashFunctions> constants,
-       const uint2 stash_constants, const unsigned max_iteration_attempts,
-       Entry *table, unsigned *stash_count, Entry entry,
-       unsigned *iterations_used) {
-  unsigned key = get_key(entry);
-  // The key is always inserted into its first slot at the start.
-  unsigned location = hash_function(constants, 0, key) % table_size;
-  // Keep inserting until an empty slot is found or the eviction chain grows too
-  // large.
-  for (unsigned its = 1; its <= max_iteration_attempts; its++) {
-    // Swap the entry into the slot; `entry` now holds the previous occupant.
-    entry = atomicExch(&table[location], entry);
-    key = get_key(entry);
-    // If no key was evicted, we're done.
-    if (key == kKeyEmpty) {
-      *iterations_used = its;
-      break;
-    }
-    // Otherwise, determine where the evicted key will go.
-    location = determine_next_location(constants, table_size, key, location);
-  }
-  if (key != kKeyEmpty) {
-    // Still holding an entry: try to claim its stash slot if it is empty.
-    unsigned slot = stash_hash_function(stash_constants, key);
-    Entry *stash = table + table_size;
-    Entry replaced_entry = atomicCAS(stash + slot, kEntryEmpty, entry);
-    if (replaced_entry != kEntryEmpty) {
-      return false;
-    } else {
-      atomicAdd(stash_count, 1);
-    }
-  }
-  return true;
-}
-//! Kernel: builds a basic hash table, using one big table.
-/*! Each thread inserts the key/value pair at its flattened thread index by
- *  calling insert().
- *  @param[in]     n_entries               Number of key/value pairs; threads
- * with an index at or beyond this do nothing.
- *  @param[in]     keys                    Keys, read at the thread index.
- *  @param[in]     values                  Values, read at the thread index.
- *  @param[in]     table_size              Forwarded to insert().
- *  @param[in]     constants               Forwarded to insert().
- *  @param[in]     max_iteration_attempts  Forwarded to insert().
- *  @param[in,out] table                   Forwarded to insert().
- *  @param[in]     stash_constants         Forwarded to insert().
- *  @param[in,out] stash_count             Forwarded to insert().
- *  @param[in,out] failures                Threads return immediately when this
- * is already non-zero.  On a failed insert it is set to 1, or atomically
- * incremented when COUNT_UNINSERTED is defined.
- *  @param[out]    iterations_taken        Debug only: when TRACK_ITERATIONS is
- * defined and the pointer is non-null, receives the iteration count reported by
- * insert() for each thread.
- */
-template <unsigned kNumHashFunctions>
-__global__ void CuckooHash(const unsigned n_entries, const unsigned *keys,
-                           const unsigned *values, const unsigned table_size,
-                           const Functions<kNumHashFunctions> constants,
-                           const unsigned max_iteration_attempts, Entry *table,
-                           uint2 stash_constants, unsigned *stash_count,
-                           unsigned *failures,
-                           unsigned *iterations_taken = nullptr) {
-  // Check if this thread has an item and if any previous threads failed.
-  unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x +
-                          blockIdx.y * blockDim.x * gridDim.x;
-  if (thread_index >= n_entries || *failures)
-    return;
-  Entry entry = make_entry(keys[thread_index], values[thread_index]);
-  unsigned iterations = 0;
-  bool success = insert<kNumHashFunctions>(
-      table_size, constants, stash_constants, max_iteration_attempts, table,
-      stash_count, entry, &iterations);
-  if (success == false) {
-    // The eviction chain grew too large.  Report failure.
-#ifdef COUNT_UNINSERTED
-    atomicAdd(failures, 1);
-#else
-    *failures = 1;
-#endif
-  }
-#ifdef TRACK_ITERATIONS
-  // iterations_taken defaults to nullptr, so only record the count when the
-  // caller actually supplied a buffer.
-  if (iterations_taken) {
-    iterations_taken[thread_index] = iterations;
-  }
-#endif
-}
-//! @}
-}; // namespace cuhash
-#endif
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End: