Merge branch 'master' into develop

d03b947a · yan.yan · 9d1e33d6 · 8aa0f1f7 · d03b947a · d03b947a
Commit d03b947a authored Nov 29, 2021 by yan.yan
20 changed files
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -24,11 +24,32 @@ jobs:
        with:
          filters: |
            needbuild:
+              - '.github/workflows/**'
              - 'setup.py'
              - 'spconv/csrc/**'
              - 'spconv/algo.py'
              - 'spconv/core.py'
              - 'pyproject.toml'
+      - name: Install Boost
+        env:
+          CUDA_VERSION: ${{ matrix.cuda-version }}
+          PYTHON_VERSION: ${{ matrix.python-version }}
+          cuda: ${{ matrix.cuda-version }}
+          BOOST_VERSION: boost_1_77_0
+        if: |
+          (
+            (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || 
+            (
+              (steps.changes.outputs.needbuild == 'true') && 
+              (env.PYTHON_VERSION == '3.10')
+            )
+          )
+        shell: powershell
+        run: |
+          $ProgressPreference = 'SilentlyContinue'
+          Invoke-WebRequest -Uri "https://boostorg.jfrog.io/artifactory/main/release/1.77.0/source/boost_1_77_0.zip" -UseBasicParsing -OutFile $HOME/boost.zip
+          Expand-Archive $HOME/boost.zip -DestinationPath $HOME/boost
      - name: Install CUDA
        env:
          CUDA_VERSION: ${{ matrix.cuda-version }}
@@ -43,7 +64,9 @@ jobs:
            )
          )
        shell: powershell
-        run: .\tools\install_windows_cuda.ps1
+        run: |
+          .\tools\install_windows_cuda.ps1
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
@@ -59,6 +82,8 @@ jobs:
        env:
          CUDA_VERSION: ${{ matrix.cuda-version }}
          PYTHON_VERSION: ${{ matrix.python-version }}
+          BOOST_VERSION: boost_1_77_0
+          CUMM_CUDA_VERSION: ${{ matrix.cuda-version }}
        if: |
          (env.CUDA_VERSION != '') && (
            (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || 
@@ -68,10 +93,11 @@ jobs:
            )
          )
        run: |
-          $Env:CUMM_CUDA_VERSION = "${{ matrix.cuda-version }}"
          $Env:CUMM_CUDA_ARCH_LIST = "all"
          $Env:SPCONV_DISABLE_JIT = "1"
          pip install pccm pybind11
+          # point the build at the header-only boost extracted by the "Install Boost" step
+          $Env:BOOST_ROOT = "$HOME/boost/boost_1_77_0"
          # ls "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${{ matrix.cuda-version }}\include\thrust"
          python -m build --wheel --outdir dist/ .
        shell: powershell
@@ -99,6 +125,7 @@ jobs:
        with:
          filters: |
            needbuild:
+              - '.github/workflows/**'
              - 'setup.py'
              - 'spconv/csrc/**'
              - 'spconv/algo.py'
@@ -111,10 +138,15 @@ jobs:
          python-version: ${{ matrix.python-version }}
      - name: Install pep build
+        env:
+          BOOST_VERSION: boost_1_77_0
        run: |
          python -m pip install build --user
          python -m pip install --upgrade pip twine wheel
          python -m pip install pytest setuptools
+          mkdir -p third_party
+          wget https://boostorg.jfrog.io/artifactory/main/release/1.77.0/source/$BOOST_VERSION.zip -O third_party/boost.zip
+          unzip third_party/boost.zip -d third_party/boost
      - name: Build a cuda wheel
        env:
@@ -122,6 +154,7 @@ jobs:
          PYTHON_VERSION: ${{ matrix.python-version }}
          DOCKER_IMAGE: scrin/manylinux2014-cuda:cu${{ matrix.cuda-version }}-devel-1.0.0
          PLAT: manylinux2014_x86_64
+          BOOST_VERSION: boost_1_77_0
        if: |
          (env.CUDA_VERSION != '') && (
            (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || 
@@ -132,7 +165,10 @@ jobs:
          )
        run: |
          chmod +x tools/build-wheels.sh
-          docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} -e SPCONV_PYTHON_LIST=${{env.PYTHON_VERSION}} -v `pwd`:/io $DOCKER_IMAGE bash -c "source /etc/bashrc && /io/tools/build-wheels.sh"
+          docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} \
+           -e SPCONV_PYTHON_LIST=${{env.PYTHON_VERSION}} \
+           -e BOOST_ROOT=/io/third_party/boost/$BOOST_VERSION \
+           -v `pwd`:/io $DOCKER_IMAGE bash -c "source /etc/bashrc && /io/tools/build-wheels.sh"
      - name: Build a cpu wheel
        env:
@@ -140,6 +176,7 @@ jobs:
          PYTHON_VERSION: ${{ matrix.python-version }}
          DOCKER_IMAGE: scrin/manylinux2014-cuda:cu114-devel-1.0.0
          PLAT: manylinux2014_x86_64
+          BOOST_VERSION: boost_1_77_0
        if: |
          (env.CUDA_VERSION == '') && (
            (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || 
@@ -150,7 +187,10 @@ jobs:
          )
        run: |
          chmod +x tools/build-wheels.sh
-          docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} -e SPCONV_PYTHON_LIST=${{env.PYTHON_VERSION}} -v `pwd`:/io $DOCKER_IMAGE bash -c "source /etc/bashrc && /io/tools/build-wheels.sh"
+          docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} \
+            -e SPCONV_PYTHON_LIST=${{env.PYTHON_VERSION}} \
+            -e BOOST_ROOT=/io/third_party/boost/$BOOST_VERSION \
+            -v `pwd`:/io $DOCKER_IMAGE bash -c "source /etc/bashrc && /io/tools/build-wheels.sh"
      - name: Publish a Python distribution to PyPI
        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Changelog
+## [2.1.17] - 2021-11-29
+### Fixed
+- Fix a bug in sparse add.
+### Added
+- Add more checks for wrong usage
+- Add insert_exist_keys for hash table
+## [2.1.16] - 2021-11-28
+### Fixed
+- Fix strange compile problem in windows
+## [2.1.15] - 2021-11-28
+### Fixed
+- Fix missing pccm.Class in setup.py
+## [2.1.14] - 2021-11-28
+### Added 
+- Add hash table
+- update cumm version
+- Add AddTableMisaligned for sptensors with same shape but different indices.
+### Fixed
+- Fix a bug already fixed in 2.1.10 but introduced in 2.1.12 again.
+## [2.1.13] - 2021-?-?
+### Added 
+- Add some ops from spconv 1.x, see spconv.utils for more details.
+- Add some debug tool for users to attach more info in issue.
 ## [2.1.12] - 2021-11-23
 ### Added 
 - Add a method for voxel generator to get pc_voxel_id, which is usually used in semantic segmentation

--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ Spconv 1.x users **NEED READ [THIS](docs/SPCONV_2_BREAKING_CHANGEs.md)** before
 * fp32 (not tf32) training/inference speed is increased (+50~80%)
 * fp16 training/inference speed is greatly increased when your layer support tensor core (channel size must be multiple of 8).
 * int8 op is ready, but we still need some time to figure out how to run int8 in pytorch.
-* [doesn't depend on pytorch binary](docs/FAQ.md#What-does-no-dependency-on-pytorch-mean), but you may need at least pytorch >= 1.6.0 to run spconv 2.x.
+* [doesn't depend on pytorch binary](docs/FAQ.md#What-does-no-dependency-on-pytorch-mean), but you may need at least pytorch >= 1.5.0 to run spconv 2.x.
 * since spconv 2.x doesn't depend on pytorch binary (never in future), it's impossible to support torch.jit/libtorch inference.
 ## Spconv 2.x Development and Roadmap
@@ -108,18 +108,32 @@ CUDA 11.1 will be removed in spconv 2.2 because pytorch 1.10 don't provide prebu
 ```pip install spconv-cu114``` for CUDA 11.4
-**NOTE** It's safe to have different **minor** cuda version between system and conda (pytorch) **in Linux**. for example, you can use spconv-cu114 with anaconda version of pytorch cuda 11.1 in a OS with CUDA 11.2 installed.
+**NOTE** It's safe to have different **minor** cuda version between system and conda (pytorch) in **CUDA >= 11.0** because of [CUDA Minor Version Compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/#minor-version-compatibility). For example, you can use spconv-cu114 with anaconda version of pytorch cuda 11.1 in a OS with CUDA 11.2 installed.
+For CUDA 10, we don't know whether ```spconv-cu102``` works with CUDA 10.0 and 10.1. Users can have a try.
 **NOTE** In Linux, you can install spconv-cuxxx without install CUDA to system! only suitable NVIDIA driver is required. for CUDA 11, we need driver >= 450.82.
+#### Prebuilt GPU Support Matrix
+See [this page](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) to check supported GPU names by arch.
+| CUDA version | GPU Arch List  |
+| -------------- |:---------------------:|
+| 10.2 | 50,52,60,61,70,75     | 
+| 11.x       | 52,60,61,70,75,80,86     | 
+| 12.x       | 60,61,70,75,80,86,90     | 
 ### Build from source for development (JIT, recommend)
 The c++ code will be built automatically when you change c++ code in project.
-For NVIDIA Embedded Platforms, you need to specify cuda arch before build: ```export CUMM_CUDA_ARCH_LIST="7.2"``` for xavier.
+For NVIDIA Embedded Platforms, you need to specify cuda arch before build: ```export CUMM_CUDA_ARCH_LIST="7.2"``` for xavier, ```export CUMM_CUDA_ARCH_LIST="6.2"``` for TX2, ```export CUMM_CUDA_ARCH_LIST="8.7"``` for orin.
 You need to remove ```cumm``` in ```requires``` section in pyproject.toml after install editable ```cumm``` and before install spconv due to pyproject limit (can't find editable installed ```cumm```).
+You need to ensure ```pip list | grep spconv``` and ```pip list | grep cumm``` show nothing before installing editable spconv/cumm.
 #### Linux
 0. uninstall spconv and cumm installed by pip
@@ -157,7 +171,9 @@ You need to rebuild ```cumm``` first if you are build along a CUDA version that
 5. run ```pip install pccm cumm wheel```
 6. run ```python setup.py bdist_wheel```+```pip install dists/xxx.whl```
+## Known issues
+* Spconv 2.x F16 runs slowly on A100. 
 ## Note

--- a/docs/BENCHMARK.md
+++ b/docs/BENCHMARK.md
@@ -16,7 +16,7 @@
 ## Simple Benchmark
-### Network Benchmark without batchnorm (F32/F16) in RTX 3080 Laptop GPU
+### Network Benchmark without batchnorm (F32/F16) in RTX 3080 Laptop GPU 150W
 Network Code: test/benchmark.py
@@ -25,6 +25,20 @@ Network Code: test/benchmark.py
 | Forward | 43ms     | 21.7ms/13.7ms    | 23.5ms/11.2ms      | 22ms/12.2ms      |
 | Backward | 80ms    | 41.9ms/25.2ms    | 51.0ms/13.8ms      | 41.1ms/12.2ms      |
+| F16 Forward | Native| Implicit Gemm | Implicit Gemm Split Mask  |
+| -------------- |:---------------------:|---------------------:| ---------------------:|
+| RTX 3080 Laptop 150W@1755MHz | 13.7ms     | 11.2ms    | 12.2ms      |
+| RTX A6000 | 19.1ms    |  11.7ms   | 14.0ms      |
+| TESLA V100 | 17.9ms    |  11.4ms   | 13.4ms      |
+| A100 | 23.8ms    |  12.4ms   | 15.1ms      |
+| F16 Backward | Native| Implicit Gemm | Implicit Gemm Split Mask  |
+| -------------- |:---------------------:|---------------------:| ---------------------:|
+| RTX 3080 Laptop 150W@1755MHz | 25.2ms     | 13.8ms    | 12.2ms      |
+| RTX A6000       | 28.1ms     | 9.2ms     | 8.9ms      |
+| TESLA V100 | 33.9ms    |  12.2ms   | 12.9ms      |
+| A100 | 37.6ms    |  12.2ms   | 13.9ms      |
 ### Network Gemm Kernel Benchmark FP16 in RTX 3080 Laptop GPU
 Network Code: test/benchmark.py

--- a/docs/PERFORMANCE_GUIDE.md
+++ b/docs/PERFORMANCE_GUIDE.md
@@ -26,30 +26,3 @@
 * spconv 2.x in Windows 10 is 1.5x~2x slower than Linux. use Linux if possible.
 See [benchmark](BENCHMARK.md) for more performance details of different algorithms.
-## Algorithm Overview
-### Native Explicit (deprecated and removed in spconv 2.x)
-native algorithm (explicit, no fused) is standard gather-gemm-scatter algorithm. Assume we compute 3x3 conv, We can split it to 9 of 1x1 conv which can be computed by matmul, then sum them to get final result.
-For sparse convolution, we also do split-gemm-sum to calculate conv, but we need to collect data first because it's sparse.
-### Native
-Fused version of above algorithm. 1.5x-2x faster than non-fused version.
-### Implicit Gemm
-```Native``` algorithm do minimal mma (matrix multiply add), but it need to serialize IO. The pipeline of ```Native``` is gather-gemm-scatter-gather-gemm-scatter-...
-```Implicit Gemm``` fuse all calculation to one kernel and perform overlapped gather-mma-scatter to save a lot of time. 
-![Image Overlapped Gemm](https://raw.githubusercontent.com/NVIDIA/cutlass/master/media/images/software-pipeline.png)
-In my test, ```Implicit Gemm``` is almost 2x faster than ```Native```.
-### Implicit Gemm Split Mask
-TODO
-In my test, ```Implicit Gemm Split Mask``` is slightly faster than ```Implicit Gemm```, but the indice generation is slower, so currently we use ```Implicit Gemm``` by default.
\ No newline at end of file
--- a/docs/USAGE.md
+++ b/docs/USAGE.md
@@ -16,6 +16,47 @@
 # Usage
+## Short API description
+```Python
+import spconv.pytorch as spconv
+from spconv.pytorch import functional as Fsp
+from torch import nn
+from spconv.pytorch.utils import PointToVoxel
+from spconv.pytorch.hash import HashTable
+```
+| Layer APIs                         | Common Usage             |            Dense Version    |Note    |
+|----------------------------------- |:------------------------:|----------------------------:|----------------------------:| 
+| ```spconv.SparseConv3d```          | Downsample               | ```nn.Conv3d```             | Use ```indice_key``` to save data for inverse |
+| ```spconv.SubMConv3d```            | Convolution              | N/A                         | Use ```indice_key``` to save data for reuse |
+| ```spconv.SparseInverseConv3d```   | Upsample                 |  N/A                        | Use pre-saved ```indice_key``` to upsample |
+| ```spconv.SparseConvTranspose3d``` | Upsample (don't use this)|  ```nn.ConvTranspose3d```   | VERY SLOW and CAN'T RECOVER ORIGIN POINT CLOUD |
+| ```spconv.SparseMaxPool3d```       | Downsample               |  ```nn.MaxPool3d```         | Use ```indice_key``` to save data for inverse |
+| ```spconv.SparseSequential```       | Container               |  ```nn.Sequential```         | support layers above and ```nn.ReLU, nn.BatchNorm, ...```|
+| Functional APIs                    | Usage                    |
+|----------------------------------- |:------------------------:|
+| ```Fsp.sparse_add```               | Add sparse tensors with same shape and different indices    |
+| Input APIs                         | Usage                    |
+|----------------------------------- |:------------------------:|
+| ```PointToVoxel```                 | point cloud to voxels    |
+| Misc APIs                         | Usage                    |
+|----------------------------------- |:------------------------:|
+| ```HashTable```                   | hash table, one-slot     |
+| Layer APIs                         | [torchsparse](https://github.com/mit-han-lab/torchsparse)             |    [MinkowskiEngine](https://github.com/NVIDIA/MinkowskiEngine)             |   
+|----------------------------------- |:------------------------:|:------------------------:|
+| ```spconv.SparseConv3d```          | ```Conv3d(stride!=1, transpose=False)```               |```MinkowskiConvolution(stride!=1)```| 
+| ```spconv.SubMConv3d```            | ```Conv3d(stride=1, transpose=False)```              | ```MinkowskiConvolution(stride=1)```| 
+| ```spconv.SparseInverseConv3d```   | ```Conv3d(stride!=1, transpose=True)```                 |```MinkowskiConvolutionTranspose```| 
+| ```spconv.SparseConvTranspose3d``` | N/A                |```MinkowskiConvolutionTranspose```| 
+| ```spconv.SparseMaxPool3d```       | N/A               | ```MinkowskiMaxPooling```| 
 ## Concept
 * Sparse Conv Tensor: like a hybrid [torch.sparse_coo_tensor](https://pytorch.org/docs/stable/sparse.html#sparse-coo-docs) but with two differences: 1. SparseConvTensor has only one dense dim, 2. the indices of SparseConvTensor are transposed. See the torch docs for more details.
@@ -102,6 +143,29 @@ class ExampleNet(nn.Module):
        return self.net(x)
 ```
+### Sparse Add
+In a semantic segmentation network, we may use conv1x3, 3x1 and 3x3 in a block, but it's impossible to sum the results from these layers because a regular add requires the same indices.
+spconv >= 2.1.17 provides an operation to add sparse tensors with different indices (shapes must be the same), but with limits:
+```Python
+from spconv.pytorch import functional as Fsp
+res_1x3 = conv1x3(x)
+res_3x1 = conv3x1(x)
+# WRONG
+# because we can't "inverse" this operation
+wrong_usage_cant_inverse = Fsp.sparse_add(res_1x3, res_3x1)
+# CORRECT
+# res_3x3 already contains all indices of res_1x3 and res_3x1, 
+# so output spatial structure isn't changed, we can "inverse" back.
+res_3x3 = conv3x3(x)
+correct = Fsp.sparse_add(res_1x3, res_3x1, res_3x3)
+```
+If you use a network without ```SparseInverseConv```, the limits above don't apply; the only drawback of ```sparse_add``` is that it runs slower than a simple aligned add.
 ### Fast Mixed Percision Training
 see example/mnist_sparse. we support ```torch.cuda.amp```.

--- a/example/simple_hash.py
+++ b/example/simple_hash.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch 
+from spconv.pytorch.hash import HashTable
+def main():
+    """Fixed-Size CUDA Hash Table:
+    this hash table can't delete keys after insert, and can't resize.
+    You need to pre-define a fixed-length of hash table, recommend 2x size
+    of your key num.
+    """
+    is_cpus = [True, False]
+    max_size = 1000
+    k_dtype = torch.int32 
+    v_dtype = torch.int64
+    for is_cpu in is_cpus:
+        if is_cpu:
+            dev = torch.device("cpu")
+            table = HashTable(dev, k_dtype, v_dtype)
+        else:
+            dev = torch.device("cuda:0")
+            table = HashTable(dev, k_dtype, v_dtype, max_size=max_size)
+        keys = torch.tensor([5, 3, 7, 4, 6, 2, 10, 8], dtype=k_dtype, device=dev)
+        values = torch.tensor([1, 6, 4, 77, 23, 756, 12, 12], dtype=v_dtype, device=dev)
+        keys_query = torch.tensor([8, 10, 2, 6, 4, 7, 3, 5], dtype=k_dtype, device=dev)
+        table.insert(keys, values)
+        vq, _ = table.query(keys_query)
+        print(vq)
+        ks, vs, cnt = table.items()
+        cnt_item = cnt.item()
+        print(cnt, ks[:cnt_item], vs[:cnt_item])
+        table.assign_arange_()
+        ks, vs, cnt = table.items()
+        cnt_item = cnt.item()
+        print(cnt, ks[:cnt_item], vs[:cnt_item])
+        print("----------Insert Exist Keys----------")
+        is_empty = table.insert_exist_keys(keys, values)
+        ks, vs, cnt = table.items()
+        cnt_item = cnt.item()
+        print(cnt, ks[:cnt_item], vs[:cnt_item])
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -156,6 +156,9 @@ if disable_jit is not None and disable_jit == "1":
    from cumm.conv.main import ConvMainUnitTest
    from cumm.constants import CUMM_CPU_ONLY_BUILD
    from spconv.csrc.sparse.all import SpconvOps
+    from spconv.csrc.utils import BoxOps
+    from spconv.csrc.hash.core import HashTable
    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
    convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS)
    convcu.namespace = "cumm.conv.main"
@@ -168,9 +171,9 @@ if disable_jit is not None and disable_jit == "1":
            std = "c++14" 
        else:
            std = "c++17"
-    cus = [cu, convcu, SpconvOps()]
+    cus = [cu, convcu, SpconvOps(), BoxOps(), HashTable()]
    if CUMM_CPU_ONLY_BUILD:
-        cus = [SpconvOps()]
+        cus = [SpconvOps(), BoxOps(), HashTable()]
    ext_modules: List[Extension] = [
        PCCMExtension(cus,
                      "spconv/core_cc",

--- a/spconv/build.py
+++ b/spconv/build.py
@@ -28,6 +28,9 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
    from cumm.conv.main import ConvMainUnitTest
    from spconv.csrc.sparse.all import SpconvOps
+    from spconv.csrc.utils import BoxOps
+    from spconv.csrc.hash.core import HashTable
    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS +
                          SHUFFLE_TURING_PARAMS)
    cu.namespace = "cumm.gemm.main"
@@ -38,7 +41,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
    if InWindows:
        # windows have command line limit, so we use objects_folder to reduce command size.
        objects_folder = "objects"
-    pccm.builder.build_pybind([cu, convcu, SpconvOps()],
+    pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps(), HashTable()],
                              PACKAGE_ROOT / "core_cc",
                              namespace_root=PACKAGE_ROOT,
                              objects_folder=objects_folder,

--- a/spconv/core_cc/csrc/hash/__init__.pyi
+++ b/spconv/core_cc/csrc/hash/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/spconv/core_cc/csrc/hash/core.pyi
+++ b/spconv/core_cc/csrc/hash/core.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class HashTable:
+    """Type stub of the compiled hash table binding under ``spconv.core_cc``.
+    query/items/insert_exist_keys return None and take their result tensors as
+    arguments (see the individual docstrings). ``stream`` arguments are plain
+    ints (NOTE(review): presumably a raw CUDA stream handle with 0 as the
+    default stream -- confirm against the C++ source).
+    """
+    # item sizes of key/value (NOTE(review): presumably in bytes -- confirm)
+    key_itemsize: int
+    value_itemsize: int
+    # True when this table is the cpu variant
+    is_cpu: bool
+    insert_count: int
+    def __init__(self, is_cpu: bool, key_itemsize: int, value_itemsize: int, keys_data: Tensor, values_data: Tensor, stream: int = 0) -> None: 
+        """
+        Args:
+            is_cpu: whether to create the cpu variant of the table.
+            key_itemsize: item size of a key (see class attribute).
+            value_itemsize: item size of a value (see class attribute).
+            keys_data: tensor for keys (NOTE(review): presumably the
+                pre-allocated key storage -- confirm).
+            values_data: tensor for values (NOTE(review): presumably the
+                pre-allocated value storage -- confirm).
+            stream: see class docstring.
+        """
+        ...
+    def clear(self, stream: int = 0) -> None: 
+        """
+        clear the table.
+        NOTE(review): the previous text here was copied from insert() and
+        described a ``values`` argument that clear() doesn't take; the exact
+        reset semantics aren't visible from this stub.
+        Args:
+            stream: see class docstring.
+        """
+        ...
+    def insert(self, keys: Tensor, values: Tensor =  Tensor(), stream: int = 0) -> None: 
+        """
+        in this function, if values is empty, it will be assigned to zero.
+        Args:
+            keys: keys to insert.
+            values: values to insert, may be empty (the default).
+            stream: see class docstring.
+        """
+        ...
+    def query(self, keys: Tensor, values: Tensor, is_empty: Tensor, stream: int) -> None: 
+        """
+        query keys, save to values, and save is_empty to is_empty
+        Args:
+            keys: keys to look up.
+            values: output, receives the queried values.
+            is_empty: output, receives the is_empty result for each key.
+            stream: see class docstring (no default here).
+        """
+        ...
+    def assign_arange_(self, count: Tensor, stream: int = 0) -> None: 
+        """
+        this function assign "arange(NumItem)" to table values.
+        useful in "unique-like" operations.
+        unlike insert/query, this method only support i32/i64/u32/u64 for value.
+        count must be u32/u64.
+        Args:
+            count: u32/u64 tensor (NOTE(review): presumably receives the
+                number of items -- confirm).
+            stream: see class docstring.
+        """
+        ...
+    def size_cpu(self) -> int: 
+        """
+        this function can only be used to get cpu hash table size.
+        """
+        ...
+    def items(self, keys: Tensor, values: Tensor, count: Tensor, stream: int) -> None: 
+        """
+        get items. nothing is returned, so the tensor arguments are the
+        destinations the items are written to.
+        Args:
+            keys: output, receives the keys.
+            values: output, receives the values.
+            count: output (NOTE(review): presumably the number of valid
+                items in keys/values -- confirm).
+            stream: see class docstring (no default here).
+        """
+        ...
+    def insert_exist_keys(self, keys: Tensor, values: Tensor, is_empty: Tensor, stream: int) -> None: 
+        """
+        insert v of given k if k exists. won't insert any new key.
+        Args:
+            keys: keys whose values should be updated.
+            values: new values for the given keys.
+            is_empty: output (NOTE(review): presumably marks the keys that
+                weren't found, as in query -- confirm).
+            stream: see class docstring (no default here).
+        """
+        ...
--- a/spconv/core_cc/csrc/utils/__init__.pyi
+++ b/spconv/core_cc/csrc/utils/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/spconv/core_cc/csrc/utils/boxops.pyi
+++ b/spconv/core_cc/csrc/utils/boxops.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class BoxOps:
+    """Type stub of the compiled box operations under ``spconv.core_cc``."""
+    # True when BOOST_ROOT was set while the extension code was generated.
+    @staticmethod
+    def has_boost() -> bool: ...
+    @staticmethod
+    def non_max_suppression_cpu(boxes: Tensor, order: Tensor, thresh: float, eps: float = 0) -> List[int]: 
+        """
+        axis-aligned non-maximum suppression on cpu.
+        Args:
+            boxes: 2d float/double tensor, one box per row. columns 0/2 and
+                1/3 are used as the min/max coordinates of the two axes.
+            order: 1d integer tensor (i32/i64/u32/u64) of box indices, boxes
+                are visited in this order.
+            thresh: a box is suppressed when its overlap (intersection over
+                union) with an earlier kept box is >= thresh.
+            eps: added to every width/height before areas are computed.
+        Returns:
+            the ``keep`` list built by the C++ implementation (indices into
+            boxes).
+        """
+        ...
--- a/spconv/cppconstants.py
+++ b/spconv/cppconstants.py
@@ -23,3 +23,6 @@ from spconv.core_cc.csrc.sparse.all import SpconvOps
 BUILD_CUMM_VERSION = SpconvOps.cumm_version()
 BUILD_PCCM_VERSION = SpconvOps.pccm_version()
+from spconv.core_cc.csrc.utils.boxops import BoxOps
+HAS_BOOST = BoxOps.has_boost()
--- a/spconv/csrc/hash/__init__.py
+++ b/spconv/csrc/hash/__init__.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/spconv/csrc/hash/core.py
+++ b/spconv/csrc/hash/core.py
--- a/spconv/csrc/utils/__init__.py
+++ b/spconv/csrc/utils/__init__.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .boxops import BoxOps
\ No newline at end of file
--- a/spconv/csrc/utils/boxops.py
+++ b/spconv/csrc/utils/boxops.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pccm 
+from pathlib import Path 
+import os 
+from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib
+from spconv.constants import BOOST_ROOT
+# pccm dependency for code that uses boost::geometry: adds BOOST_ROOT to the
+# include directories and includes "boost/geometry.hpp".
+class BoostGeometryLib(pccm.Class):
+    def __init__(self):
+        super().__init__()
+        # this class is only usable when a boost location is configured.
+        assert BOOST_ROOT is not None 
+        self.build_meta.add_includes(BOOST_ROOT)
+        self.add_include("boost/geometry.hpp")
+class BoxOps(pccm.Class):
+    def __init__(self):
+        super().__init__()
+        # the generated C++ below uses tv::Tensor / tv::dispatch, so the
+        # TensorView dependency is registered for the whole class.
+        self.add_dependency(TensorView)
+    @pccm.pybind.mark
+    @pccm.static_function
+    def has_boost(self):
+        code = pccm.FunctionCode()
+        code.raw(f"return {pccm.boolean(BOOST_ROOT is not None)};")
+        return code.ret("bool")
+    @pccm.pybind.mark(nogil=True)
+    @pccm.static_function
+    def non_max_suppression_cpu(self):
+        code = pccm.FunctionCode()
+        code.arg("boxes, order", "tv::Tensor")
+        code.arg("thresh", "float")
+        code.arg("eps", "float", "0")
+        code.raw(f"""
+        auto ndets = boxes.dim(0);
+        std::vector<int> keep(ndets);
+        tv::dispatch<float, double>(boxes.dtype(), [&](auto I1){{
+            using DType = TV_DECLTYPE(I1);
+            auto boxes_r = boxes.tview<const DType, 2>();
+            tv::dispatch<int, int64_t, uint32_t, uint64_t>(order.dtype(), [&](auto I2){{
+                using T2 = TV_DECLTYPE(I2);
+                auto order_r = order.tview<const T2, 1>();
+                std::vector<DType> areas;
+                for (int i = 0; i < ndets; ++i){{
+                    areas[i] = (boxes_r(i, 2) - boxes_r(i, 0) + eps) * 
+                               (boxes_r(i, 3) - boxes_r(i, 1) + eps);
+                }}
+                std::vector<int> suppressed(ndets, 0);
+                int i, j;
+                DType xx1, xx2, w, h, inter, ovr;
+                for (int _i = 0; _i < ndets; ++_i) {{
+                    i = order_r(_i);
+                    if (suppressed[i] == 1)
+                        continue;
+                    keep.push_back(i);
+                    for (int _j = _i + 1; _j < ndets; ++_j) {{
+                        j = order_r(_j);
+                        if (suppressed[j] == 1)
+                            continue;
+                        xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2));
+                        xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0));
+                        w = xx2 - xx1 + eps;
+                        if (w > 0) {{
+                            xx2 = std::min(boxes_r(i, 3), boxes_r(j, 3));
+                            xx1 = std::max(boxes_r(i, 1), boxes_r(j, 1));
+                            h = xx2 - xx1 + eps;
+                            if (h > 0) {{
+                            inter = w * h;
+                            ovr = inter / (areas[i] + areas[j] - inter);
+                            if (ovr >= thresh)
+                                suppressed[j] = 1;
+                            }}
+                        }}
+                    }}
+                }}
+            }});
+        }});
+        return keep;
+        """)
+        return code.ret("std::vector<int>")
+    @pccm.pybind.mark(nogil=True)
+    @pccm.static_function
+    def rotate_non_max_suppression_cpu(self):
+        # Generates the C++ source of a CPU non-maximum suppression for
+        # rotated boxes, using Boost.Geometry polygons for the exact overlap.
+        #
+        # Generated signature (all tensors are tv::Tensor):
+        #   box_corners : viewed as rank 3, indexed (box, corner 0..3, coord 0..1);
+        #                 dtype must be float or double.
+        #   order       : viewed as rank 1; int / int64_t / uint32_t / uint64_t.
+        #                 Boxes are visited in this sequence.
+        #   standup_iou : viewed as rank 2 with the same dtype as box_corners;
+        #                 pairs with standup_iou(i, j) <= 0 are skipped before
+        #                 any polygon work is done.
+        #   thresh      : a later box j is suppressed when IoU(i, j) >= thresh.
+        #   eps         : accepted (default 0) but not read by the body below.
+        # Returns std::vector<int> holding the kept box indices in visit order.
+        #
+        # When BOOST_ROOT is None the function is emitted as invalid instead.
+        code = pccm.FunctionCode()
+        code.arg("box_corners, order, standup_iou", "tv::Tensor")
+        code.arg("thresh", "float")
+        code.arg("eps", "float", "0")
+        if BOOST_ROOT is None:
+            return code.make_invalid()
+        code.add_dependency(BoostGeometryLib)
+        code.raw(f"""
+        auto ndets = box_corners.dim(0);
+        // Start empty: kept indices are appended with push_back below, so a
+        // sized constructor would leave ndets spurious zeros at the front.
+        std::vector<int> keep;
+        keep.reserve(ndets);
+        tv::dispatch<float, double>(box_corners.dtype(), [&](auto I1){{
+            using DType = TV_DECLTYPE(I1);
+            auto box_corners_r = box_corners.tview<const DType, 3>();
+            auto standup_iou_r = standup_iou.tview<const DType, 2>();
+            tv::dispatch<int, int64_t, uint32_t, uint64_t>(order.dtype(), [&](auto I2){{
+                using T2 = TV_DECLTYPE(I2);
+                auto order_r = order.tview<const T2, 1>();
+                std::vector<int> suppressed(ndets, 0);
+                int i, j;
+                namespace bg = boost::geometry;
+                typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
+                typedef bg::model::polygon<point_t> polygon_t;
+                polygon_t poly, qpoly;
+                std::vector<polygon_t> poly_inter, poly_union;
+                DType inter_area, union_area, overlap;
+                for (int _i = 0; _i < ndets; ++_i) {{
+                    i = order_r(_i);
+                    if (suppressed[i] == 1)
+                        continue;
+                    keep.push_back(i);
+                    for (int _j = _i + 1; _j < ndets; ++_j) {{
+                        j = order_r(_j);
+                        if (suppressed[j] == 1)
+                            continue;
+                        // cheap reject: no overlap of the standup boxes
+                        if (standup_iou_r(i, j) <= 0.0)
+                            continue;
+                        // four corners plus the first one again to close the ring
+                        bg::append(poly,
+                                point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
+                        bg::append(poly,
+                                point_t(box_corners_r(i, 1, 0), box_corners_r(i, 1, 1)));
+                        bg::append(poly,
+                                point_t(box_corners_r(i, 2, 0), box_corners_r(i, 2, 1)));
+                        bg::append(poly,
+                                point_t(box_corners_r(i, 3, 0), box_corners_r(i, 3, 1)));
+                        bg::append(poly,
+                                point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
+                        bg::append(qpoly,
+                                point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
+                        bg::append(qpoly,
+                                point_t(box_corners_r(j, 1, 0), box_corners_r(j, 1, 1)));
+                        bg::append(qpoly,
+                                point_t(box_corners_r(j, 2, 0), box_corners_r(j, 2, 1)));
+                        bg::append(qpoly,
+                                point_t(box_corners_r(j, 3, 0), box_corners_r(j, 3, 1)));
+                        bg::append(qpoly,
+                                point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
+                        bg::intersection(poly, qpoly, poly_inter);
+                        if (!poly_inter.empty()) {{
+                            inter_area = bg::area(poly_inter.front());
+                            bg::union_(poly, qpoly, poly_union);
+                            if (!poly_union.empty()) {{ // ignore invalid box
+                                union_area = bg::area(poly_union.front());
+                                overlap = inter_area / union_area;
+                                if (overlap >= thresh)
+                                    suppressed[j] = 1;
+                                poly_union.clear();
+                            }}
+                        }}
+                        // reset the scratch geometry for the next pair
+                        poly.clear();
+                        qpoly.clear();
+                        poly_inter.clear();
+                    }}
+                }}
+            }});
+        }});
+        return keep;
+        """)
+        return code.ret("std::vector<int>")
+    @pccm.pybind.mark(nogil=True)
+    @pccm.static_function
+    def rbbox_iou(self):
+        # Generates the C++ source of a pairwise rotated-box overlap routine
+        # built on Boost.Geometry polygons.
+        #
+        # Generated signature (all tensors are tv::Tensor):
+        #   box_corners    : viewed as rank 3, indexed (box n, corner 0..3, coord 0..1);
+        #                    dtype must be float or double.
+        #   qbox_corners   : same layout and dtype, indexed by query box k.
+        #   standup_iou    : rank 2, read at (n, k); pairs whose value is
+        #                    <= standup_thresh are skipped entirely.
+        #   overlaps       : rank 2 output, written at (n, k).
+        #   standup_thresh : float gate applied to standup_iou.
+        #   inter_only     : when true the intersection area is stored,
+        #                    otherwise intersection area / union area.
+        #
+        # Entries of `overlaps` are only written for pairs that pass the gate
+        # and have a non-empty intersection (and, for IoU, a non-empty union);
+        # everything else is left untouched.
+        # NOTE(review): presumably the caller pre-fills `overlaps` - confirm.
+        code = pccm.FunctionCode()
+        code.arg("box_corners, qbox_corners, standup_iou, overlaps", "tv::Tensor")
+        code.arg("standup_thresh", "float")
+        code.arg("inter_only", "bool")
+        # Without a Boost root the function is emitted as invalid.
+        if BOOST_ROOT is None:
+            return code.make_invalid()
+        code.add_dependency(BoostGeometryLib)
+        # The C++ body: returns early when either box set is empty, then loops
+        # over k (outer) and n (inner). Each polygon is the four corners plus
+        # the first corner repeated; scratch geometry is cleared after each pair.
+        code.raw(f"""
+        auto N = box_corners.dim(0);
+        auto K = qbox_corners.dim(0);
+        if (N == 0 || K == 0) {{
+            return;
+        }}
+        tv::dispatch<float, double>(box_corners.dtype(), [&](auto I1){{
+            using DType = TV_DECLTYPE(I1);
+            auto box_corners_r = box_corners.tview<const DType, 3>();
+            auto qbox_corners_r = qbox_corners.tview<const DType, 3>();
+            auto standup_iou_r = standup_iou.tview<const DType, 2>();
+            auto overlaps_rw = overlaps.tview<DType, 2>();
+            namespace bg = boost::geometry;
+            typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
+            typedef bg::model::polygon<point_t> polygon_t;
+            polygon_t poly, qpoly;
+            std::vector<polygon_t> poly_inter, poly_union;
+            DType inter_area, union_area;
+            for (int k = 0; k < K; ++k) {{
+                for (int n = 0; n < N; ++n) {{
+                    if (standup_iou_r(n, k) <= standup_thresh)
+                        continue;
+                    bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
+                    bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
+                    bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
+                    bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
+                    bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
+                    bg::append(qpoly,
+                                point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
+                    bg::append(qpoly,
+                                point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
+                    bg::append(qpoly,
+                                point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
+                    bg::append(qpoly,
+                                point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
+                    bg::append(qpoly,
+                                point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
+                    bg::intersection(poly, qpoly, poly_inter);
+                    if (!poly_inter.empty()) {{
+                        inter_area = bg::area(poly_inter.front());
+                        if (inter_only){{
+                            overlaps_rw(n, k) = inter_area;
+                        }}else{{
+                            bg::union_(poly, qpoly, poly_union);
+                            if (!poly_union.empty()) {{
+                                union_area = bg::area(poly_union.front());
+                                overlaps_rw(n, k) = inter_area / union_area;
+                            }}
+                            poly_union.clear();
+                        }}
+                    }}
+                    poly.clear();
+                    qpoly.clear();
+                    poly_inter.clear();
+                }}
+            }}
+        }});
+        return;
+        """)
+        # No code.ret(...) here: the generated body only uses bare `return;`.
+        return code
+    @pccm.pybind.mark(nogil=True)
+    @pccm.static_function
+    def rbbox_iou_aligned(self):
+        # Generates the C++ source of an element-wise ("aligned") rotated-box
+        # overlap routine: box n is compared only with query box n.
+        #
+        # Generated signature (all tensors are tv::Tensor):
+        #   box_corners  : viewed as rank 3, indexed (box n, corner 0..3, coord 0..1);
+        #                  dtype must be float or double.
+        #   qbox_corners : same layout and dtype; must have the same first
+        #                  dimension as box_corners (checked at runtime with
+        #                  TV_ASSERT_RT_ERR).
+        #   overlaps     : rank 1 output, written at (n).
+        #   inter_only   : when true the intersection area is stored,
+        #                  otherwise intersection area / union area.
+        #
+        # Entries of `overlaps` are only written when the intersection (and,
+        # for IoU, the union) is non-empty; other entries are left untouched.
+        # NOTE(review): presumably the caller pre-fills `overlaps` - confirm.
+        code = pccm.FunctionCode()
+        code.arg("box_corners, qbox_corners, overlaps", "tv::Tensor")
+        code.arg("inter_only", "bool")
+        # Without a Boost root the function is emitted as invalid.
+        if BOOST_ROOT is None:
+            return code.make_invalid()
+        code.add_dependency(BoostGeometryLib)
+        # The C++ body: asserts N == K, returns early on empty input, then
+        # builds both closed polygons (four corners plus the first repeated)
+        # per index and clears the scratch geometry after each iteration.
+        code.raw(f"""
+        auto N = box_corners.dim(0);
+        auto K = qbox_corners.dim(0);
+        TV_ASSERT_RT_ERR(N == K, "aligned iou must have same number of box")
+        if (N == 0 || K == 0) {{
+            return;
+        }}
+        tv::dispatch<float, double>(box_corners.dtype(), [&](auto I1){{
+            using DType = TV_DECLTYPE(I1);
+            auto box_corners_r = box_corners.tview<const DType, 3>();
+            auto qbox_corners_r = qbox_corners.tview<const DType, 3>();
+            auto overlaps_rw = overlaps.tview<DType, 1>();
+            namespace bg = boost::geometry;
+            typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
+            typedef bg::model::polygon<point_t> polygon_t;
+            polygon_t poly, qpoly;
+            std::vector<polygon_t> poly_inter, poly_union;
+            DType inter_area, union_area;
+            for (int n = 0; n < N; ++n) {{
+                bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
+                bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
+                bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
+                bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
+                bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
+                bg::append(qpoly,
+                            point_t(qbox_corners_r(n, 0, 0), qbox_corners_r(n, 0, 1)));
+                bg::append(qpoly,
+                            point_t(qbox_corners_r(n, 1, 0), qbox_corners_r(n, 1, 1)));
+                bg::append(qpoly,
+                            point_t(qbox_corners_r(n, 2, 0), qbox_corners_r(n, 2, 1)));
+                bg::append(qpoly,
+                            point_t(qbox_corners_r(n, 3, 0), qbox_corners_r(n, 3, 1)));
+                bg::append(qpoly,
+                            point_t(qbox_corners_r(n, 0, 0), qbox_corners_r(n, 0, 1)));
+                bg::intersection(poly, qpoly, poly_inter);
+                if (!poly_inter.empty()) {{
+                    inter_area = bg::area(poly_inter.front());
+                    if (inter_only){{
+                        overlaps_rw(n) = inter_area;
+                    }}else{{
+                        bg::union_(poly, qpoly, poly_union);
+                        if (!poly_union.empty()) {{
+                            union_area = bg::area(poly_union.front());
+                            overlaps_rw(n) = inter_area / union_area;
+                        }}
+                        poly_union.clear();
+                    }}
+                }}
+                poly.clear();
+                qpoly.clear();
+                poly_inter.clear();
+            }}
+        }});
+        return;
+        """)
+        # No code.ret(...) here: the generated body only uses bare `return;`.
+        return code
--- a/spconv/debug_utils.py
+++ b/spconv/debug_utils.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pickle 
+from pathlib import Path 
+from spconv.constants import SPCONV_DEBUG_SAVE_PATH
+def spconv_save_debug_data(data):
+    """Pickle ``data`` to ``SPCONV_DEBUG_SAVE_PATH`` for attaching to a bug report.
+
+    This is a best-effort helper: it never raises. The outcome (saved,
+    failed, or no path configured) is reported with ``print``.
+
+    Args:
+        data: any picklable object holding the debug information.
+    """
+    if not SPCONV_DEBUG_SAVE_PATH:
+        print((f"SPCONV_DEBUG_SAVE_PATH not found, "
+                "you can specify SPCONV_DEBUG_SAVE_PATH as debug data save path "
+                "to save debug data which can be attached in a issue."))
+        return
+    try:
+        save_path = Path(SPCONV_DEBUG_SAVE_PATH)
+        # Explicit check rather than `assert`: asserts are stripped under
+        # `python -O`, which would silently drop this validation. The
+        # exception is caught below, so the reported text stays the same.
+        if not save_path.parent.exists():
+            raise FileNotFoundError("parent of SPCONV_DEBUG_SAVE_PATH must exist")
+        with save_path.open("wb") as f:
+            pickle.dump(data, f)
+        print((f"spconv save debug data to {SPCONV_DEBUG_SAVE_PATH}, "
+                "you can submit issue with log and this debug data attached."))
+    except Exception as e:
+        # Deliberately broad: saving debug data must not mask the original
+        # failure that triggered it, so any problem is reported and swallowed.
+        print((f"spconv try to save debug data to {SPCONV_DEBUG_SAVE_PATH}, "
+                f"but failed with exception {e}. please check your SPCONV_DEBUG_SAVE_PATH"))
--- a/spconv/pytorch/__init__.py
+++ b/spconv/pytorch/__init__.py
@@ -3,6 +3,7 @@ from pathlib import Path
 import numpy as np
 import torch
+from spconv.pytorch.core import SparseConvTensor
 from spconv.pytorch import functional, ops
 from spconv.pytorch.conv import (SparseConv1d, SparseConv2d, SparseConv3d,
                                 SparseConv4d, SparseConvTranspose1d,
@@ -11,7 +12,6 @@ from spconv.pytorch.conv import (SparseConv1d, SparseConv2d, SparseConv3d,
                                 SparseInverseConv2d, SparseInverseConv3d,
                                 SparseInverseConv4d, SubMConv1d, SubMConv2d,
                                 SubMConv3d, SubMConv4d)
-from spconv.pytorch.core import SparseConvTensor
 from spconv.pytorch.identity import Identity
 from spconv.pytorch.modules import (SparseModule, SparseSequential,
                                    assign_name_for_sparse_modules)