v2.1.14: add hash table, fix small bug

076bdb05 · yan.yan · d406d9e2 · 076bdb05 · 076bdb05 · 076bdb05
Commit 076bdb05 authored Nov 28, 2021 by yan.yan
20 changed files
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -30,14 +30,14 @@ jobs:
              - 'spconv/algo.py'
              - 'spconv/core.py'
              - 'pyproject.toml'
-      - name: Install CUDA
+      - name: Install Boost
        env:
          CUDA_VERSION: ${{ matrix.cuda-version }}
          PYTHON_VERSION: ${{ matrix.python-version }}
          cuda: ${{ matrix.cuda-version }}
          BOOST_VERSION: boost_1_77_0
        if: |
-          (env.CUDA_VERSION != '') && (
+          (
            (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || 
            (
              (steps.changes.outputs.needbuild == 'true') && 
@@ -49,7 +49,24 @@ jobs:
          $ProgressPreference = 'SilentlyContinue'
          Invoke-WebRequest -Uri "https://boostorg.jfrog.io/artifactory/main/release/1.77.0/source/boost_1_77_0.zip" -UseBasicParsing -OutFile $HOME/boost.zip
          Expand-Archive $HOME/boost.zip -DestinationPath $HOME/boost
+
+      - name: Install CUDA
+        env:
+          CUDA_VERSION: ${{ matrix.cuda-version }}
+          PYTHON_VERSION: ${{ matrix.python-version }}
+          cuda: ${{ matrix.cuda-version }}
+        if: |
+          (env.CUDA_VERSION != '') && (
+            (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || 
+            (
+              (steps.changes.outputs.needbuild == 'true') && 
+              (env.PYTHON_VERSION == '3.10')
+            )
+          )
+        shell: powershell
+        run: |
          .\tools\install_windows_cuda.ps1
+        
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
@@ -66,6 +83,7 @@ jobs:
          CUDA_VERSION: ${{ matrix.cuda-version }}
          PYTHON_VERSION: ${{ matrix.python-version }}
          BOOST_VERSION: boost_1_77_0
+          CUMM_CUDA_VERSION: ${{ matrix.cuda-version }}
        if: |
          (env.CUDA_VERSION != '') && (
            (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || 
@@ -75,7 +93,6 @@ jobs:
            )
          )
        run: |
-          $Env:CUMM_CUDA_VERSION = "${{ matrix.cuda-version }}"
          $Env:CUMM_CUDA_ARCH_LIST = "all"
          $Env:SPCONV_DISABLE_JIT = "1"
          pip install pccm pybind11

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Changelog

+## [2.1.14] - 2021-11-28
+### Added 
+- Add hash table
+- update cumm version
+- Add AddTableMisaligned for sptensors with same shape but different indices.
+### Fixed
+- Fix a bug that was already fixed in 2.1.10 but reintroduced in 2.1.12.
+
 ## [2.1.13] - 2021-?-?
 ### Added 
 - Add some ops from spconv 1.x, see spconv.utils for more details.

--- a/README.md
+++ b/README.md
@@ -171,7 +171,9 @@ You need to rebuild ```cumm``` first if you are build along a CUDA version that
 5. run ```pip install pccm cumm wheel```
 6. run ```python setup.py bdist_wheel```+```pip install dists/xxx.whl```

+## Known issues

+* Spconv 2.x F16 runs slowly on A100.

 ## Note


--- a/docs/BENCHMARK.md
+++ b/docs/BENCHMARK.md
@@ -27,15 +27,17 @@ Network Code: test/benchmark.py

 | F16 Forward | Native| Implicit Gemm | Implicit Gemm Split Mask  |
 | -------------- |:---------------------:|---------------------:| ---------------------:|
-| RTX 3080 Laptop 150W | 13.7ms     | 11.2ms    | 12.2ms      |
+| RTX 3080 Laptop 150W@1755MHz | 13.7ms     | 11.2ms    | 12.2ms      |
 | RTX A6000 | 19.1ms    |  11.7ms   | 14.0ms      |
 | TESLA V100 | 17.9ms    |  11.4ms   | 13.4ms      |
+| A100 | 23.8ms    |  12.4ms   | 15.1ms      |

 | F16 Backward | Native| Implicit Gemm | Implicit Gemm Split Mask  |
 | -------------- |:---------------------:|---------------------:| ---------------------:|
-| RTX 3080 Laptop 150W | 25.2ms     | 13.8ms    | 12.2ms      |
+| RTX 3080 Laptop 150W@1755MHz | 25.2ms     | 13.8ms    | 12.2ms      |
 | RTX A6000       | 28.1ms     | 9.2ms     | 8.9ms      |
 | TESLA V100 | 33.9ms    |  12.2ms   | 12.9ms      |
+| A100 | 37.6ms    |  12.2ms   | 13.9ms      |

 ### Network Gemm Kernel Benchmark FP16 in RTX 3080 Laptop GPU


--- a/example/simple_hash.py
+++ b/example/simple_hash.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch 
+from spconv.pytorch.hash import HashTable
+
+
+def main():
+    """Fixed-Size CUDA Hash Table:
+    this hash table can't delete keys after insert, and can't resize.
+    You need to pre-define a fixed-length of hash table, recommend 2x size
+    of your key num.
+
+    """
+    is_cpus = [True, False]
+    max_size = 1000
+    k_dtype = torch.int32 
+    v_dtype = torch.int64
+    for is_cpu in is_cpus:
+        if is_cpu:
+            dev = torch.device("cpu")
+            table = HashTable(dev, k_dtype, v_dtype)
+        else:
+            dev = torch.device("cuda:0")
+            table = HashTable(dev, k_dtype, v_dtype, max_size=max_size)
+
+        keys = torch.tensor([5, 3, 7, 4, 6, 2, 10, 8], dtype=k_dtype, device=dev)
+        values = torch.tensor([1, 6, 4, 77, 23, 756, 12, 12], dtype=v_dtype, device=dev)
+        keys_query = torch.tensor([8, 10, 2, 6, 4, 7, 3, 5], dtype=k_dtype, device=dev)
+
+        table.insert(keys, values)
+
+        vq, _ = table.query(keys_query)
+        print(vq)
+        ks, vs, cnt = table.items()
+        cnt_item = cnt.item()
+        print(cnt, ks[:cnt_item], vs[:cnt_item])
+
+        table.assign_arange_()
+        ks, vs, cnt = table.items()
+        cnt_item = cnt.item()
+        print(cnt, ks[:cnt_item], vs[:cnt_item])
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/spconv/build.py
+++ b/spconv/build.py
@@ -29,6 +29,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(

    from spconv.csrc.sparse.all import SpconvOps
    from spconv.csrc.utils import BoxOps
+    from spconv.csrc.hash.core import HashTable

    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS +
                          SHUFFLE_TURING_PARAMS)
@@ -40,7 +41,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
    if InWindows:
        # windows have command line limit, so we use objects_folder to reduce command size.
        objects_folder = "objects"
-    pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps()],
+    pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps(), HashTable()],
                              PACKAGE_ROOT / "core_cc",
                              namespace_root=PACKAGE_ROOT,
                              objects_folder=objects_folder,

--- a/spconv/core_cc/csrc/hash/__init__.pyi
+++ b/spconv/core_cc/csrc/hash/__init__.pyi
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/spconv/core_cc/csrc/hash/core.pyi
+++ b/spconv/core_cc/csrc/hash/core.pyi
+from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+from pccm.stubs import EnumValue, EnumClassValue
+from cumm.tensorview import Tensor
+class HashTable:
+    # Read-only properties bound from the native members key_itemsize_,
+    # value_itemsize_, is_cpu and insert_count_.
+    key_itemsize: int
+    value_itemsize: int
+    is_cpu: bool
+    insert_count: int
+    def __init__(self, is_cpu: bool, key_itemsize: int, value_itemsize: int, keys_data: Tensor, values_data: Tensor, stream: int = 0) -> None: 
+        """
+        create a hash table. for a non-cpu table the storage tensors are
+        validated and the table is cleared on the given stream.
+
+        Args:
+            is_cpu: True for a cpu table, False for a cuda table.
+            key_itemsize: size of one key in bytes, must be 4 or 8.
+            value_itemsize: size of one value in bytes, must be 4 or 8.
+            keys_data: key storage of a cuda table; there it must be
+                non-empty, hold items of key_itemsize bytes and have the
+                same dim(0) as values_data.
+            values_data: value storage of a cuda table; there it must be
+                non-empty and hold items of value_itemsize bytes.
+            stream: cuda stream handle passed as an integer.
+        """
+        ...
+    def clear(self, stream: int = 0) -> None: 
+        """
+        remove every entry from the table.
+
+        Args:
+            stream: cuda stream handle passed as an integer; only the cuda
+                path uses it.
+        """
+        ...
+    def insert(self, keys: Tensor, values: Tensor =  Tensor(), stream: int = 0) -> None: 
+        """
+        in this function, if values is empty, it will be assigned to zero.
+
+        Args:
+            keys: keys to insert; itemsize must equal key_itemsize.
+            values: optional values, one per key; when given, itemsize must
+                equal value_itemsize and dim(0) must equal keys.dim(0).
+            stream: cuda stream handle passed as an integer.
+        """
+        ...
+    def query(self, keys: Tensor, values: Tensor, is_empty: Tensor, stream: int) -> None: 
+        """
+        query keys, save to values, and save is_empty to is_empty
+
+        Args:
+            keys: keys to look up; itemsize must equal key_itemsize.
+            values: output buffer for the values of the keys that were
+                found; same dim(0) as keys.
+            is_empty: output uint8 flags marking keys that were not found;
+                same dim(0) as keys.
+            stream: cuda stream handle passed as an integer.
+        """
+        ...
+    def assign_arange_(self, count: Tensor, stream: int = 0) -> None: 
+        """
+        this function assign "arange(NumItem)" to table values.
+        useful in "unique-like" operations.
+        unlike insert/query, this method only support i32/i64/u32/u64 for value.
+        count must be u32/u64.
+        Args:
+            count: counter tensor; must live on the cuda device for a cuda
+                table. NOTE(review): the cpu path does not appear to use
+                it -- confirm.
+            stream: cuda stream handle passed as an integer.
+        """
+        ...
+    def size_cpu(self) -> int: 
+        """
+        this function can only be used to get cpu hash table size.
+
+        calling it on a cuda table raises a runtime error.
+        """
+        ...
+    def items(self, keys: Tensor, values: Tensor, count: Tensor, stream: int) -> None: 
+        """
+        get items.
+
+        Args:
+            keys: output buffer for keys; itemsize must equal key_itemsize.
+            values: output buffer for values; itemsize must equal
+                value_itemsize and dim(0) must equal keys.dim(0).
+            count: counter tensor handed to the cuda kernel.
+                NOTE(review): the cpu path does not appear to write it --
+                confirm.
+            stream: cuda stream handle passed as an integer.
+        """
+        ...
--- a/spconv/core_cc/csrc/utils/boxops.pyi
+++ b/spconv/core_cc/csrc/utils/boxops.pyi
@@ -14,36 +14,3 @@ class BoxOps:
            eps: 
        """
        ...
-    @staticmethod
-    def rotate_non_max_suppression_cpu(box_corners: Tensor, order: Tensor, standup_iou: Tensor, thresh: float, eps: float = 0) -> List[int]: 
-        """
-        Args:
-            box_corners: 
-            order: 
-            standup_iou: 
-            thresh: 
-            eps: 
-        """
-        ...
-    @staticmethod
-    def rbbox_iou(box_corners: Tensor, qbox_corners: Tensor, standup_iou: Tensor, overlaps: Tensor, standup_thresh: float, inter_only: bool) -> None: 
-        """
-        Args:
-            box_corners: 
-            qbox_corners: 
-            standup_iou: 
-            overlaps: 
-            standup_thresh: 
-            inter_only: 
-        """
-        ...
-    @staticmethod
-    def rbbox_iou_aligned(box_corners: Tensor, qbox_corners: Tensor, overlaps: Tensor, inter_only: bool) -> None: 
-        """
-        Args:
-            box_corners: 
-            qbox_corners: 
-            overlaps: 
-            inter_only: 
-        """
-        ...
--- a/spconv/csrc/hash/__init__.py
+++ b/spconv/csrc/hash/__init__.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/spconv/csrc/hash/core.py
+++ b/spconv/csrc/hash/core.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from pathlib import Path
+from cumm.constants import CUMM_CPU_ONLY_BUILD
+
+import pccm
+from cumm.common import (TensorView, TensorViewCPU, TensorViewHashKernel,
+                         TensorViewKernel, TslRobinMap)
+from spconv.csrc.sparse.cpu_core import OMPLib
+
+class HashTable(pccm.Class, pccm.pybind.PybindClassMixin):
+    """a simple hashtable for both cpu and cuda.
+    CPU implementation don't support parallel.
+    both cpu and cuda only support 32/64bit key value.
+    """
+    def __init__(self):
+        super().__init__()
+        self.add_dependency(TensorView, TslRobinMap)
+        if CUMM_CPU_ONLY_BUILD:
+            self.add_dependency(OMPLib)
+        self.add_include("tensorview/parallel/all.h")
+        self.add_member("keys_data, values_data", "tv::Tensor")
+        self.add_pybind_member("key_itemsize_", "int", prop_name="key_itemsize", readwrite=False)
+        self.add_pybind_member("value_itemsize_", "int", prop_name="value_itemsize", readwrite=False)
+
+        self.add_pybind_member("is_cpu", "bool", readwrite=False)
+        self.add_member("map_4_4", "tsl::robin_map<uint32_t, uint32_t>")
+        self.add_member("map_4_8", "tsl::robin_map<uint32_t, uint64_t>")
+        self.add_member("map_8_4", "tsl::robin_map<uint64_t, uint32_t>")
+        self.add_member("map_8_8", "tsl::robin_map<uint64_t, uint64_t>")
+        self.add_pybind_member("insert_count_", "int64_t", prop_name="insert_count", readwrite=False)
+
+    @pccm.pybind.mark 
+    @pccm.constructor
+    def ctor(self):
+        # Describes the C++ constructor of the generated class.
+        code = pccm.FunctionCode()
+        # Constructor arguments; `stream` defaults to 0.
+        code.arg("is_cpu", "bool")
+        code.arg("key_itemsize, value_itemsize", "int")
+        code.arg("keys_data", "tv::Tensor")
+        code.arg("values_data", "tv::Tensor")
+        code.arg("stream", "std::uintptr_t", "0")
+
+        # Initializer list: copy each argument into its member.
+        code.ctor_init("is_cpu", "is_cpu")
+        code.ctor_init("keys_data", "keys_data")
+        code.ctor_init("values_data", "values_data")
+        code.ctor_init("key_itemsize_", "key_itemsize")
+        code.ctor_init("value_itemsize_", "value_itemsize")
+
+        # Nothing has been inserted yet.
+        code.ctor_init("insert_count_", "0")
+
+        # Constructor body: item sizes must be 4 or 8 bytes. A non-CPU table
+        # additionally needs non-empty storage tensors of equal dim(0) whose
+        # itemsize matches the declared sizes, and is cleared on `stream`.
+        code.raw(f"""
+        TV_ASSERT_RT_ERR(key_itemsize == 4 || key_itemsize == 8, "key_itemsize must be 4 or 8");
+        TV_ASSERT_RT_ERR(value_itemsize == 4 || value_itemsize == 8, "value_itemsize must be 4 or 8");
+
+        if (!is_cpu){{
+            TV_ASSERT_RT_ERR(!keys_data.empty() && !values_data.empty(), "key and value must not empty");
+            TV_ASSERT_RT_ERR(keys_data.dim(0) == values_data.dim(0), "key and value must have same size");
+            TV_ASSERT_RT_ERR(key_itemsize == keys_data.itemsize(), "key_itemsize must equal to key_data");
+            TV_ASSERT_RT_ERR(value_itemsize == values_data.itemsize(), "value_itemsize must equal to values_data");
+            // clear cuda table here.
+            clear(stream);
+        }}
+        """)
+        # A CPU-only build rejects any request for a non-CPU table at runtime.
+        if CUMM_CPU_ONLY_BUILD:
+            code.raw(f"TV_ASSERT_RT_ERR(is_cpu, \"spconv not built with CUDA\");")
+        return code 
+
+    @pccm.pybind.mark 
+    @pccm.cuda.member_function
+    def clear(self):
+        """ in this function, if values is empty, it will be assigned to zero.
+        """
+        code = pccm.FunctionCode()
+        if not CUMM_CPU_ONLY_BUILD:
+            code.add_dependency(TensorViewHashKernel)
+        code.arg("stream", "std::uintptr_t", "0")
+        with code.if_("is_cpu"):
+            code.raw(f"""
+            if (is_cpu){{
+                map_4_4.clear();
+                map_4_8.clear();
+                map_8_4.clear();
+                map_8_8.clear();
+                return;
+            }}
+            """)
+        if not CUMM_CPU_ONLY_BUILD:
+            with code.else_():
+                code.raw(f"""
+                auto custream = reinterpret_cast<cudaStream_t>(stream);
+                tv::dispatch_int<4, 8>(keys_data.itemsize(), [&](auto IK){{
+                    constexpr int IKV = TV_DECLTYPE(IK)::value;
+                    using K = tv::hash::itemsize_to_unsigned_t<IKV>;
+                    constexpr K kEmptyKey = std::numeric_limits<K>::max();
+                    K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
+                    tv::dispatch_int<4, 8>(values_data.itemsize(), [&](auto IV){{
+                        constexpr int IVV = TV_DECLTYPE(IV)::value;
+                        using V = tv::hash::itemsize_to_unsigned_t<IVV>;
+                        V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
+                        using table_t =
+                            tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
+                                                        kEmptyKey, false>;
+                        table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
+                        tv::cuda::Launch launcher(table.size(), custream);
+                        launcher(tv::hash::clear_table_split<table_t>, table);
+                    }});
+                }});
+                """)
+        return code 
+
+
+    @pccm.pybind.mark 
+    @pccm.cuda.member_function
+    def insert(self):
+        """ in this function, if values is empty, it will be assigned to zero.
+        """
+        code = pccm.FunctionCode()
+        if not CUMM_CPU_ONLY_BUILD:
+            code.add_dependency(TensorViewHashKernel)
+        code.arg("keys", "tv::Tensor")
+        code.arg("values", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
+        code.arg("stream", "std::uintptr_t", "0")
+
+        code.raw(f"""
+        if (!is_cpu){{
+            int64_t value_after_insert = keys.dim(0) + insert_count_;
+            TV_ASSERT_RT_ERR(value_after_insert < keys_data.dim(0), "inserted count exceed maximum hash size");
+            insert_count_ += keys.dim(0);
+        }}
+        auto N = keys.dim(0);
+        TV_ASSERT_RT_ERR(keys.itemsize() == key_itemsize_, "keys itemsize not equal to", key_itemsize_);
+        if (!values.empty()){{
+            TV_ASSERT_RT_ERR(values.itemsize() == value_itemsize_, "values itemsize not equal to", value_itemsize_);
+            TV_ASSERT_RT_ERR(keys.dim(0) == values.dim(0), "number of key and value must same");
+        }}
+        """)
+        with code.if_("is_cpu"):
+            map_name = "cpu_map"
+            for k_type, v_type in self.cpu_map_storage_select("key_itemsize_", "value_itemsize_", map_name, code):
+                code.raw(f"""
+                auto k_ptr = reinterpret_cast<const {k_type}*>(keys.raw_data());
+                if (values.empty()){{
+                    for (size_t i = 0; i < N; ++i){{
+                        {map_name}.insert({{k_ptr[i], {v_type}(0)}});
+                    }}
+                }}
+                else{{
+                    auto v_ptr = reinterpret_cast<const {v_type}*>(values.raw_data());
+                    for (size_t i = 0; i < N; ++i){{
+                        {map_name}.insert({{k_ptr[i], v_ptr[i]}});
+                    }}
+                }}
+                """)
+        if not CUMM_CPU_ONLY_BUILD:
+            with code.else_():
+                code.raw(f"""
+                auto custream = reinterpret_cast<cudaStream_t>(stream);
+                tv::dispatch_int<4, 8>(keys_data.itemsize(), [&](auto IK){{
+                    constexpr int IKV = TV_DECLTYPE(IK)::value;
+                    using K = tv::hash::itemsize_to_unsigned_t<IKV>;
+                    constexpr K kEmptyKey = std::numeric_limits<K>::max();
+                    K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
+                    const K* key_ptr = reinterpret_cast<const K*>(keys.raw_data());
+                    tv::dispatch_int<4, 8>(values_data.itemsize(), [&](auto IV){{
+                        constexpr int IVV = TV_DECLTYPE(IV)::value;
+                        using V = tv::hash::itemsize_to_unsigned_t<IVV>;
+                        V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
+                        const V* value_ptr = reinterpret_cast<const V*>(values.raw_data());
+                        using table_t =
+                            tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
+                                                        kEmptyKey, false>;
+                        tv::cuda::Launch launcher(N, custream);
+                        table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
+                        launcher(tv::hash::insert_split<table_t>, table, key_ptr, value_ptr, size_t(N));
+                    }});
+                }});
+                """)
+        else:
+            code.raw(f"""
+            TV_THROW_RT_ERR("spconv not compiled with cuda, don't support cuda");
+            """)
+        return code 
+
+    @pccm.pybind.mark 
+    @pccm.cuda.member_function
+    def query(self):
+        """query keys, save to values, and save is_empty to is_empty
+        """
+        code = pccm.FunctionCode()
+        if not CUMM_CPU_ONLY_BUILD:
+            code.add_dependency(TensorViewHashKernel)
+        code.arg("keys", "tv::Tensor")
+        code.arg("values", "tv::Tensor")
+        code.arg("is_empty", "tv::Tensor")
+
+        code.arg("stream", "std::uintptr_t")
+
+        code.raw(f"""
+        auto N = keys.dim(0);
+        TV_ASSERT_RT_ERR(keys.itemsize() == key_itemsize_, "keys itemsize not equal to", key_itemsize_);
+        TV_ASSERT_RT_ERR(values.itemsize() == value_itemsize_, "values itemsize not equal to", value_itemsize_);
+        TV_ASSERT_RT_ERR(N == values.dim(0) && is_empty.dim(0) == N, "number of key and value must same");
+        auto is_empty_ptr = is_empty.data_ptr<uint8_t>();
+        """)
+        with code.if_("is_cpu"):
+            map_name = "cpu_map"
+            # here it's safe to use omp in query.
+            for k_type, v_type in self.cpu_map_storage_select("key_itemsize_", "value_itemsize_", map_name, code):
+                code.raw(f"""
+                auto k_ptr = reinterpret_cast<{k_type}*>(keys.raw_data());
+                auto v_ptr = reinterpret_cast<{v_type}*>(values.raw_data());
+                tv::kernel_1d_cpu(keys.device(), N, [&](size_t begin, size_t end, size_t step){{
+                    bool emp;
+                    for (size_t i = begin; i < end; i += step){{
+                        auto iter = {map_name}.find(k_ptr[i]);
+                        emp = iter == {map_name}.end();
+                        if (!emp){{
+                            v_ptr[i] = iter->second;
+                        }}
+                        is_empty_ptr[i] = uint8_t(emp);
+                    }}
+                }});
+                """)
+        if not CUMM_CPU_ONLY_BUILD:
+            with code.else_():
+                code.raw(f"""
+                auto custream = reinterpret_cast<cudaStream_t>(stream);
+                tv::dispatch_int<4, 8>(keys_data.itemsize(), [&](auto IK){{
+                    constexpr int IKV = TV_DECLTYPE(IK)::value;
+
+                    using K = tv::hash::itemsize_to_unsigned_t<IKV>;
+                    constexpr K kEmptyKey = std::numeric_limits<K>::max();
+                    K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
+                    K* key_ptr = reinterpret_cast<K*>(keys.raw_data());
+                    tv::dispatch_int<4, 8>(values_data.itemsize(), [&](auto IV){{
+                        constexpr int IVV = TV_DECLTYPE(IV)::value;
+
+                        using V = tv::hash::itemsize_to_unsigned_t<IVV>;
+                        V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
+                        V* value_ptr = reinterpret_cast<V*>(values.raw_data());
+                        using table_t =
+                            tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
+                                                        kEmptyKey, false>;
+                        tv::cuda::Launch launcher(N, custream);
+                        table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
+                        launcher(tv::hash::query_split<table_t>, table, key_ptr, value_ptr, is_empty_ptr, size_t(N));
+                    }});
+                }});
+                """)
+        else:
+            code.raw(f"""
+            TV_THROW_RT_ERR("spconv not compiled with cuda, don't support cuda");
+            """)
+        return code 
+
+    @pccm.pybind.mark 
+    @pccm.cuda.member_function
+    def assign_arange_(self):
+        """ this function assign "arange(NumItem)" to table values.
+        useful in "unique-like" operations.
+        unlike insert/query, this method only support i32/i64/u32/u64 for value.
+        count must be u32/u64.
+        """
+        code = pccm.FunctionCode()
+        if not CUMM_CPU_ONLY_BUILD:
+            code.add_dependency(TensorViewHashKernel)
+        code.arg("count", "tv::Tensor")
+
+        code.arg("stream", "std::uintptr_t", "0")
+        with code.if_("is_cpu"):
+            map_name = "cpu_map"
+            for k_type, v_type in self.cpu_map_storage_select("key_itemsize_", "value_itemsize_", map_name, code):
+                code.raw(f"""
+                {v_type} index = 0;
+                for (auto it = {map_name}.begin(); it != {map_name}.end(); ++it){{
+                    it.value() = index;
+                    ++index;
+                }}
+
+                """)
+        if not CUMM_CPU_ONLY_BUILD:
+            with code.else_():
+                code.raw(f"""
+                TV_ASSERT_RT_ERR(count.device() == 0, "count must be cuda");
+                auto custream = reinterpret_cast<cudaStream_t>(stream);
+                tv::dispatch_int<4, 8>(keys_data.itemsize(), [&](auto IK){{
+                    constexpr int IKV = TV_DECLTYPE(IK)::value;
+                    using K = tv::hash::itemsize_to_unsigned_t<IKV>;
+                    constexpr K kEmptyKey = std::numeric_limits<K>::max();
+                    auto count_ptr = count.data_ptr<K>();
+
+                    K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
+                    tv::dispatch<int32_t, int64_t, uint32_t, uint64_t>(values_data.dtype(), [&](auto IV){{
+                        using V = TV_DECLTYPE(IV);
+                        V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
+                        using table_t =
+                            tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
+                                                        kEmptyKey, false>;
+                        table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
+                        tv::cuda::Launch launcher(table.size(), custream);
+                        launcher(tv::hash::assign_arange_split<table_t, K>, table, count_ptr);
+                    }});
+                }});
+                """)
+        else:
+            code.raw(f"""
+            TV_THROW_RT_ERR("spconv not compiled with cuda, don't support cuda");
+            """)
+        return code 
+
+    @pccm.pybind.mark 
+    @pccm.cuda.member_function
+    def size_cpu(self):
+        """ this function can only be used to get cpu hash table size.
+        """
+        code = pccm.FunctionCode()
+        # `res` starts at -1; the assert rejects non-CPU tables before any
+        # map is consulted.
+        code.raw(f"""
+        int64_t res = -1;
+        TV_ASSERT_RT_ERR(is_cpu, "size_cpu can only be used in cpu hash table");
+        """)
+        with code.if_("is_cpu"):
+            map_name = "cpu_map"
+            # The yielded key/value type names are not needed here; only the
+            # `cpu_map` binding made by the selector is used.
+            for _ in self.cpu_map_storage_select("key_itemsize_", "value_itemsize_", map_name, code):
+                code.raw(f"""
+                res = {map_name}.size();
+                """)
+        code.raw(f"return res;")
+        # Declare the C++ return type of the generated function.
+        return code.ret("int64_t")
+
+
+    @pccm.pybind.mark 
+    @pccm.cuda.member_function
+    def items(self):
+        """get items.
+        """
+        code = pccm.FunctionCode()
+        if not CUMM_CPU_ONLY_BUILD:
+            code.add_dependency(TensorViewHashKernel)
+        code.arg("keys", "tv::Tensor")
+        code.arg("values", "tv::Tensor")
+        code.arg("count", "tv::Tensor")
+
+        code.arg("stream", "std::uintptr_t")
+
+        code.raw(f"""
+        auto N = keys.dim(0);
+        TV_ASSERT_RT_ERR(keys.itemsize() == key_itemsize_, "keys itemsize not equal to", key_itemsize_);
+        TV_ASSERT_RT_ERR(values.itemsize() == value_itemsize_, "values itemsize not equal to", value_itemsize_);
+        TV_ASSERT_RT_ERR(N == values.dim(0), "number of key and value must same");
+        
+        """)
+        with code.if_("is_cpu"):
+            map_name = "cpu_map"
+            # here it's safe to use omp in query.
+            for k_type, v_type in self.cpu_map_storage_select("key_itemsize_", "value_itemsize_", map_name, code):
+                code.raw(f"""
+                auto k_ptr = reinterpret_cast<{k_type}*>(keys.raw_data());
+                auto v_ptr = reinterpret_cast<{v_type}*>(values.raw_data());
+                {v_type} index = 0;
+                for (auto it = {map_name}.begin(); it != {map_name}.end(); ++it){{
+                    if (index >= N){{
+                        break;
+                    }}
+                    k_ptr[index] = it->first;
+                    v_ptr[index] = it->second;
+                    ++index;
+                }}
+                """)
+        if not CUMM_CPU_ONLY_BUILD:
+            with code.else_():
+                code.raw(f"""
+                auto custream = reinterpret_cast<cudaStream_t>(stream);
+                tv::dispatch_int<4, 8>(keys_data.itemsize(), [&](auto IK){{
+                    constexpr int IKV = TV_DECLTYPE(IK)::value;
+
+                    using K = tv::hash::itemsize_to_unsigned_t<IKV>;
+                    auto count_ptr = count.data_ptr<K>();
+
+                    constexpr K kEmptyKey = std::numeric_limits<K>::max();
+                    K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
+                    K* key_ptr = reinterpret_cast<K*>(keys.raw_data());
+                    tv::dispatch_int<4, 8>(values_data.itemsize(), [&](auto IV){{
+                        constexpr int IVV = TV_DECLTYPE(IV)::value;
+
+                        using V = tv::hash::itemsize_to_unsigned_t<IVV>;
+
+                        V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
+                        V* value_ptr = reinterpret_cast<V*>(values.raw_data());
+                        using table_t =
+                            tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
+                                                        kEmptyKey, false>;
+                        tv::cuda::Launch launcher(N, custream);
+                        table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
+                        launcher(tv::hash::iterate_table_split<table_t, K>, table, key_ptr, value_ptr, size_t(N), count_ptr);
+                    }});
+                }});
+                """)
+        else:
+            code.raw(f"""
+            TV_THROW_RT_ERR("spconv not compiled with cuda, don't support cuda");
+            """)
+        return code 
+
+
+    def cpu_map_storage_select(self, k_itemsize: str, v_itemsize: str, res_var: str, code: pccm.FunctionCode):
+        different_kvs = [(4, 4), (4, 8), (8, 4), (8, 8)]
+        item_size_to_dtype = {
+            4: "uint32_t",
+            8: "uint64_t",
+        }
+        with code.block(""):
+            code.raw("bool found = false;")
+            for kit, vit in different_kvs:
+                with code.if_(f"{k_itemsize} == {kit} && {v_itemsize} == {vit}"):
+                    code.raw(f"auto& {res_var} = map_{kit}_{vit};")
+                    yield item_size_to_dtype[kit], item_size_to_dtype[vit]
+                    code.raw(f"found = true;")
+            code.raw("TV_ASSERT_RT_ERR(found, \"suitable hash table not found.\");")
+        
+
--- a/spconv/pytorch/__init__.py
+++ b/spconv/pytorch/__init__.py
@@ -3,6 +3,7 @@ from pathlib import Path

 import numpy as np
 import torch
+from spconv.pytorch.core import SparseConvTensor
 from spconv.pytorch import functional, ops
 from spconv.pytorch.conv import (SparseConv1d, SparseConv2d, SparseConv3d,
                                 SparseConv4d, SparseConvTranspose1d,
@@ -11,7 +12,6 @@ from spconv.pytorch.conv import (SparseConv1d, SparseConv2d, SparseConv3d,
                                 SparseInverseConv2d, SparseInverseConv3d,
                                 SparseInverseConv4d, SubMConv1d, SubMConv2d,
                                 SubMConv3d, SubMConv4d)
-from spconv.pytorch.core import SparseConvTensor
 from spconv.pytorch.identity import Identity
 from spconv.pytorch.modules import (SparseModule, SparseSequential,
                                    assign_name_for_sparse_modules)

--- a/spconv/pytorch/core.py
+++ b/spconv/pytorch/core.py
@@ -18,7 +18,6 @@ import numpy as np
 import torch
 from spconv.core import ConvAlgo
 from spconv.pytorch.constants import PYTORCH_VERSION
-from spconv.pytorch.ops import ThrustSortAllocator
 from spconv.tools import CUDAKernelTimer

 if PYTORCH_VERSION >= [1, 8, 0]:
@@ -39,6 +38,24 @@ else:
        pass


+class ThrustSortAllocator:
+    """Cache of uint8 scratch buffers on one device, handed out as raw pointers.
+
+    Buffers are kept in ``alloced_objs`` (requested byte count -> tensor) and
+    are never released by this class, so a returned pointer stays valid for
+    as long as the allocator itself is alive.
+    """
+    def __init__(self, device: torch.device) -> None:
+        super().__init__()
+        self.device = device
+        # requested size in bytes -> backing uint8 tensor
+        self.alloced_objs = {}
+
+    def alloc(self, n: int):
+        """Return the data pointer of a buffer holding at least ``n`` bytes.
+
+        An exact-size buffer is preferred; otherwise the first cached buffer
+        that is strictly larger than ``n`` is reused; otherwise a new
+        ``n``-byte buffer is allocated and cached.
+        """
+        pool = self.alloced_objs
+        if n in pool:
+            return pool[n].data_ptr()
+        # first (insertion order) cached buffer that is big enough
+        reusable = next((buf for size, buf in pool.items() if size > n), None)
+        if reusable is not None:
+            return reusable.data_ptr()
+        fresh = torch.empty([n], dtype=torch.uint8, device=self.device)
+        pool[n] = fresh
+        return fresh.data_ptr()
+
+
 class IndiceData(object):
    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
                 spatial_shape, out_spatial_shape, is_subm: bool, algo: ConvAlgo):

--- a/spconv/pytorch/cppcore.py
+++ b/spconv/pytorch/cppcore.py
@@ -46,6 +46,9 @@ def torch_tensor_to_tv(ten: torch.Tensor,
        dtype = _TORCH_DTYPE_TO_TV[ten.dtype]
    return tv.from_blob(ptr, shape, dtype, tv_device)

+def torch_tensors_to_tv(*tens: torch.Tensor):
+    """Lazily convert each torch tensor with ``torch_tensor_to_tv``.
+
+    The result is a one-shot generator that yields the converted tensors in
+    argument order; unpack it or wrap it in ``list``/``tuple`` if it has to
+    be consumed more than once.
+    """
+    for ten in tens:
+        yield torch_tensor_to_tv(ten)
+

 def get_current_stream():
    return torch.cuda.current_stream().cuda_stream

--- a/spconv/pytorch/functional.py
+++ b/spconv/pytorch/functional.py
@@ -20,15 +20,18 @@ from torch import nn
 from torch.autograd import Function
 from typing import Optional, TypeVar
 from spconv.tools import CUDAKernelTimer
-from spconv.pytorch import ops
+from spconv.pytorch import ops, SparseConvTensor
 from spconv.pytorch.constants import PYTORCH_VERSION
 from spconv.debug_utils import spconv_save_debug_data
 from torch.autograd.function import once_differentiable
 import numpy as np
 from pathlib import Path
-
+from spconv.pytorch.hash import HashTable
+from cumm.gemm.layout import to_stride
 from typing import List

+_MAX_INT32 = 2147483647
+
 _T = TypeVar("_T")

 def identity_decorator(func: _T) -> _T:
@@ -357,3 +360,69 @@ indice_inverse_conv = SparseInverseConvFunction.apply
 indice_subm_conv = SubMConvFunction.apply
 indice_maxpool = SparseMaxPoolFunction.apply
 indice_maxpool_implicit_gemm = SparseMaxPoolImplicitGemmFunction.apply
+
+
+def _indice_to_scalar(indices: torch.Tensor, shape: List[int]):
+    """Flatten per-row N-d integer coordinates into one scalar key per row.
+
+    The key of a row is ``indices[:, -1]`` plus ``stride[i] * indices[:, i]``
+    for every other column, with ``stride`` taken from cumm's ``to_stride``
+    (presumably row-major strides of ``shape``; verify against cumm).
+
+    Args:
+        indices: 2-d integer tensor with ``len(shape)`` columns. It is left
+            unmodified.
+        shape: extent of every coordinate axis.
+
+    Returns:
+        A new contiguous 1-d tensor with the dtype of ``indices``.
+    """
+    assert indices.shape[1] == len(shape)
+    stride = to_stride(np.array(shape, dtype=np.int64))
+    # clone first: indices[:, -1] is a view, so accumulating into it in place
+    # would overwrite the caller's last coordinate column.
+    scalar_inds = indices[:, -1].clone()
+    for i in range(len(shape) - 1):
+        scalar_inds += stride[i] * indices[:, i]
+    return scalar_inds.contiguous()
+
+def sparse_add_hash_based(*tens: SparseConvTensor):
+    """Add sparse tensors that share a shape but may have different indices.
+
+    Every input row is turned into a scalar key and inserted into one hash
+    table; ``assign_arange_`` then gives each distinct key an output row
+    number, and the features of every input are accumulated into that row.
+
+    Args:
+        *tens: tensors with equal ``spatial_shape``, ``batch_size`` and
+            feature width.
+
+    Returns:
+        A new ``SparseConvTensor`` that inherits ``benchmark``,
+        ``benchmark_record``, ``_timer`` and ``thrust_allocator`` from the
+        first input.
+    """
+    first = tens[0]
+    for ten in tens:
+        assert ten.spatial_shape == first.spatial_shape
+        assert ten.batch_size == first.batch_size
+        assert ten.features.shape[1] == first.features.shape[1]
+    feat = first.features
+    shape = [first.batch_size, *first.spatial_shape]
+    # int32 keys are only used while the dense volume stays below INT32_MAX
+    need_int64 = int(np.prod(shape)) >= _MAX_INT32
+    key_dtype = torch.int64 if need_int64 else torch.int32
+    # capacity: twice the total number of input rows
+    table_size = 2 * sum(ten.features.shape[0] for ten in tens)
+    table = HashTable(feat.device, key_dtype, torch.int32, table_size)
+
+    scalars: List[torch.Tensor] = []
+    for ten in tens:
+        indices = ten.indices.long() if need_int64 else ten.indices
+        scalar = _indice_to_scalar(indices, shape)
+        scalars.append(scalar)
+        table.insert(scalar)
+
+    # number the stored keys 0..count-1; count is the number of output rows
+    num_out = int(table.assign_arange_().item())
+    out_features = torch.zeros([num_out, feat.shape[1]],
+                               dtype=feat.dtype, device=feat.device)
+    out_indices = torch.zeros([num_out, first.indices.shape[1]],
+                              dtype=first.indices.dtype,
+                              device=first.indices.device)
+
+    for ten, scalar in zip(tens, scalars):
+        rows = table.query(scalar)[0].long()
+        out_features[rows] += ten.features
+        out_indices[rows] = ten.indices
+
+    res = SparseConvTensor(out_features, out_indices, first.spatial_shape,
+                           first.batch_size, benchmark=first.benchmark)
+    res.benchmark_record = first.benchmark_record
+    res._timer = first._timer
+    res.thrust_allocator = first.thrust_allocator
+    return res
+
+def sparse_add(a: SparseConvTensor, b: SparseConvTensor):
+    """Add two sparse tensors by going through torch COO sparse tensors.
+
+    Both inputs are wrapped as ``torch.sparse_coo_tensor`` of shape
+    ``[batch_size, *spatial_shape, channels]``, summed and coalesced.
+
+    Args:
+        a, b: tensors with equal ``spatial_shape``, ``batch_size`` and
+            feature width.
+
+    Returns:
+        A new ``SparseConvTensor`` built from the coalesced int32 indices and
+        values; only ``spatial_shape`` and ``batch_size`` are taken from
+        ``a``.
+    """
+    assert a.spatial_shape == b.spatial_shape
+    assert a.batch_size == b.batch_size
+    # compare a with b; comparing a with itself could never fail
+    assert a.features.shape[1] == b.features.shape[1]
+    res_shape = [a.batch_size, *a.spatial_shape, a.features.shape[1]]
+
+    # sparse_coo_tensor expects indices as (ndim, nnz), hence the transpose
+    a_th = torch.sparse_coo_tensor(a.indices.T, a.features, res_shape, requires_grad=True)
+    b_th = torch.sparse_coo_tensor(b.indices.T, b.features, res_shape, requires_grad=True)
+
+    c_th = (a_th + b_th).coalesce()
+    c_th_inds = c_th.indices().T.contiguous().int()
+    c_th_values = c_th.values()
+    assert c_th_values.is_contiguous()
+    return SparseConvTensor(c_th_values, c_th_inds, a.spatial_shape, a.batch_size)
--- a/spconv/pytorch/hash.py
+++ b/spconv/pytorch/hash.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+import torch 
+from cumm import tensorview as tv 
+from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
+
+from spconv.core_cc.csrc.hash.core import HashTable as _HashTable
+
+_TORCH_DTYPE_TO_ITEMSIZE = {
+    torch.int32: 4,
+    torch.int64: 8,
+    torch.float32: 4,
+    torch.float64: 8,
+}
+
+class HashTable:
+    """Hash table over 4- or 8-byte keys and values, usable on cpu and cuda.
+
+    On cuda the table has a fixed capacity: ``max_size`` slots are allocated
+    up front, so pass roughly twice the number of keys you plan to insert.
+    On cpu no torch storage is allocated here and ``max_size`` is not used.
+
+    See ``sparse_add_hash_based`` in spconv/pytorch/functional for a real
+    example: it uses this table to add sparse tensors that have the same
+    shape but different indices.
+    """
+    def __init__(self, device: torch.device, key_dtype: torch.dtype,
+                 value_dtype: torch.dtype,
+                 max_size: int = -1) -> None:
+        self.is_cpu = device.type == "cpu"
+        self.key_dtype = key_dtype
+        self.value_dtype = value_dtype
+        if self.is_cpu:
+            # no torch-side storage; empty tensors are handed to the backend
+            self.keys_data = None
+            self.values_data = None
+            key_data_tv = tv.Tensor()
+            value_data_tv = tv.Tensor()
+        else:
+            assert max_size > 0, "you must provide max_size for fixed-size cuda hash table, usually *2 of num of keys"
+            assert device is not None, "you must specify device for cuda hash table."
+            # kept as attributes so the storage outlives the tv views below
+            self.keys_data = torch.empty([max_size], dtype=key_dtype, device=device)
+            self.values_data = torch.empty([max_size], dtype=value_dtype, device=device)
+            key_data_tv = torch_tensor_to_tv(self.keys_data)
+            value_data_tv = torch_tensor_to_tv(self.values_data)
+        stream = self._current_stream()
+        # KeyError here means the dtype is not a supported 4/8-byte type
+        self.key_itemsize = _TORCH_DTYPE_TO_ITEMSIZE[self.key_dtype]
+        self.value_itemsize = _TORCH_DTYPE_TO_ITEMSIZE[self.value_dtype]
+        self._valid_value_dtype_for_arange = {torch.int32, torch.int64}
+
+        self._table = _HashTable(self.is_cpu, self.key_itemsize, self.value_itemsize,
+                                 key_data_tv, value_data_tv, stream)
+
+    def _current_stream(self):
+        """Return 0 for a cpu table, otherwise the current torch cuda stream."""
+        if self.is_cpu:
+            return 0
+        return get_current_stream()
+
+    def _make_cuda_counter(self):
+        """Allocate the zeroed one-element device counter used by cuda calls.
+
+        Its width follows the key item size (int32 for 4, int64 for 8) and
+        it is exposed to the backend as the unsigned type of the same width.
+        Returns ``(torch_counter, tv_view)``.
+        """
+        assert self.values_data is not None
+        counter_dtypes = {
+            4: (torch.int32, tv.uint32),
+            8: (torch.int64, tv.uint64),
+        }
+        if self.key_itemsize not in counter_dtypes:
+            raise NotImplementedError
+        th_dtype, tv_dtype = counter_dtypes[self.key_itemsize]
+        count = torch.zeros([1], dtype=th_dtype, device=self.values_data.device)
+        return count, torch_tensor_to_tv(count, dtype=tv_dtype)
+
+    def insert(self, keys: torch.Tensor, values: Optional[torch.Tensor] = None):
+        """Insert ``keys`` (and optionally ``values``) into the table.
+
+        When ``values`` is None only the keys are inserted and the values
+        stored for them are undefined.
+        """
+        keys_tv = torch_tensor_to_tv(keys)
+        if values is None:
+            values_tv = tv.Tensor()
+        else:
+            values_tv = torch_tensor_to_tv(values)
+        return self._table.insert(keys_tv, values_tv, self._current_stream())
+
+    def query(self, keys: torch.Tensor, values: Optional[torch.Tensor] = None):
+        """Look up ``keys`` and return ``(values, is_empty)``.
+
+        ``values`` is filled by the backend; it is allocated here, on the
+        device of ``keys`` and with the table's value dtype, when not given.
+        ``is_empty`` is a uint8 tensor with one entry per key (presumably
+        non-zero where the key was not found; verify against the backend).
+        """
+        num_keys = keys.shape[0]
+        if values is None:
+            values = torch.empty([num_keys], dtype=self.value_dtype, device=keys.device)
+        is_empty = torch.empty([num_keys], dtype=torch.uint8, device=keys.device)
+        self._table.query(torch_tensor_to_tv(keys), torch_tensor_to_tv(values),
+                          torch_tensor_to_tv(is_empty), self._current_stream())
+        return values, is_empty
+
+    def assign_arange_(self):
+        """Run the backend ``assign_arange_`` and return a one-element count.
+
+        On cpu the count is the table size read before the call (int64, and
+        the value dtype must be int32 or int64); on cuda it is a device
+        counter that the backend receives and is expected to fill.
+        """
+        stream = self._current_stream()
+        if self.is_cpu:
+            assert self.value_dtype in self._valid_value_dtype_for_arange
+            count_tv = tv.Tensor()
+            count = torch.tensor([self._table.size_cpu()], dtype=torch.int64)
+        else:
+            count, count_tv = self._make_cuda_counter()
+        self._table.assign_arange_(count_tv, stream)
+        return count
+
+    def items(self, max_size: int = -1):
+        """Dump the table and return ``(keys, values, count)``.
+
+        On cuda the output buffers hold ``max_size`` entries (the table
+        capacity when ``max_size`` is -1) and ``count`` is a device counter;
+        on cpu ``max_size`` is ignored and the buffers are sized by the
+        current table size. Slice the outputs with ``count.item()``.
+        """
+        stream = self._current_stream()
+        if self.is_cpu:
+            count_tv = tv.Tensor()
+            max_size = self._table.size_cpu()
+            count = torch.tensor([max_size], dtype=torch.int64)
+            keys = torch.empty([max_size], dtype=self.key_dtype)
+            values = torch.empty([max_size], dtype=self.value_dtype)
+        else:
+            count, count_tv = self._make_cuda_counter()
+            if max_size == -1:
+                max_size = self.values_data.shape[0]
+            out_device = self.values_data.device
+            keys = torch.empty([max_size], dtype=self.key_dtype, device=out_device)
+            values = torch.empty([max_size], dtype=self.value_dtype, device=out_device)
+        self._table.items(torch_tensor_to_tv(keys), torch_tensor_to_tv(values),
+                          count_tv, stream)
+        return keys, values, count
+
+
+def main():
+    """Smoke-test HashTable on cpu and then on cuda:0, printing the results.
+
+    For each device: insert eight key/value pairs, query them back in
+    reverse order, dump the table, run ``assign_arange_`` and dump again.
+    """
+    max_size = 1000
+    k_dtype = torch.int32
+    v_dtype = torch.int64
+
+    def show_items(table):
+        # only the first `count` entries of the dumped buffers are printed
+        ks, vs, cnt = table.items()
+        cnt_item = cnt.item()
+        print(cnt, ks[:cnt_item], vs[:cnt_item])
+
+    for dev_name in ("cpu", "cuda:0"):
+        dev = torch.device(dev_name)
+        if dev_name == "cpu":
+            table = HashTable(dev, k_dtype, v_dtype)
+        else:
+            # the cuda table is fixed-size and needs an explicit capacity
+            table = HashTable(dev, k_dtype, v_dtype, max_size=max_size)
+
+        keys = torch.tensor([5, 3, 7, 4, 6, 2, 10, 8], dtype=k_dtype, device=dev)
+        values = torch.tensor([1, 6, 4, 77, 23, 756, 12, 12], dtype=v_dtype, device=dev)
+        keys_query = torch.tensor([8, 10, 2, 6, 4, 7, 3, 5], dtype=k_dtype, device=dev)
+
+        table.insert(keys, values)
+        vq, _ = table.query(keys_query)
+        print(vq)
+        show_items(table)
+
+        table.assign_arange_()
+        show_items(table)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/spconv/pytorch/ops.py
+++ b/spconv/pytorch/ops.py
@@ -22,6 +22,7 @@ import numpy as np
 import spconv
 from spconv.core import AlgoHint, ConvAlgo
 from typing import List, Optional, Union
+from spconv.pytorch.core import ThrustSortAllocator
 from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
 from spconv.core_cc.csrc.sparse.all import SpconvOps
 import spconv.core_cc as _ext
@@ -43,24 +44,6 @@ from spconv.tools import CUDAKernelTimer
 DEBUG = False


-class ThrustSortAllocator:
-    def __init__(self, device: torch.device) -> None:
-        super().__init__()
-        self.alloced_objs = {}
-
-        self.device = device
-
-    def alloc(self, n: int):
-        if n in self.alloced_objs:
-            return self.alloced_objs[n].data_ptr()
-        for n_cur, ten in self.alloced_objs.items():
-            if n < n_cur:
-                return ten.data_ptr()
-        ten = torch.empty([n], dtype=torch.uint8, device=self.device)
-        self.alloced_objs[n] = ten
-        return ten.data_ptr()
-
-
 def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    ndim = len(input_size)
    output_size = []
@@ -1482,3 +1465,4 @@ def indice_maxpool_implicit_gemm_backward(features, out_features, out_bp,
                                             out_bp_tv, din_tv,
                                             indice_pairs_tv, stream)
    return din
+
--- a/spconv/pytorch/tables.py
+++ b/spconv/pytorch/tables.py
@@ -19,37 +19,68 @@ from torch.autograd import Function
 from spconv.pytorch.modules import SparseModule
 from spconv.pytorch.core import SparseConvTensor
 from typing import List
+from spconv.pytorch import functional as F


-class JoinTable(SparseModule):  # Module):
+class JoinTable(SparseModule):
    def forward(self, input: List[SparseConvTensor]):
+        msg = "you can't use JoinTable in two sptensor with different indices."
+
+        for ten in input:
+            assert ten.spatial_shape == input[0].spatial_shape, msg
+            assert ten.batch_size == input[0].batch_size, msg
+            assert ten.features.shape[1] == input[0].features.shape[1], msg
+            assert ten.indices.shape[0] == input[0].indices.shape[0], msg
        output = SparseConvTensor(torch.cat([i.features for i in input], 1),
                                  input[0].indices, input[0].spatial_shape,
                                  input[0].batch_size, input[0].grid,
                                  input[0].voxel_num, input[0].indice_dict)
        output.benchmark_record = input[1].benchmark_record
        output.thrust_allocator = input[1].thrust_allocator
+        output._timer = input[1]._timer
+
        return output

    def input_spatial_size(self, out_size):
        return out_size


-class AddTable(SparseModule):  # Module):
+class AddTable(SparseModule): 
    def forward(self, input: List[SparseConvTensor]):
+        msg = "you can't use AddTable in two sptensor with different indices. use AddTableMisaligned instead."
+        for ten in input:
+            assert ten.spatial_shape == input[0].spatial_shape, msg
+            assert ten.batch_size == input[0].batch_size, msg
+            assert ten.features.shape[1] == input[0].features.shape[1], msg
+            assert ten.indices.shape[0] == input[0].indices.shape[0], msg
+
        output = SparseConvTensor(sum([i.features for i in input]),
                                  input[0].indices, input[0].spatial_shape,
                                  input[0].batch_size, input[0].grid,
                                  input[0].voxel_num, input[0].indice_dict)
        output.benchmark_record = input[1].benchmark_record
        output.thrust_allocator = input[1].thrust_allocator
+        output._timer = input[1]._timer
+
        return output

    def input_spatial_size(self, out_size):
        return out_size

+class AddTableMisaligned(SparseModule):
+    """Sum sparse tensors that share a shape but not necessarily indices.
+
+    Slower than AddTable, which requires identical indices.
+    WARNING: do not use this in segmentation networks such as U-Net:
+    adding misaligned tensors clears the downsample indices, so
+    SparseInverseConvXd stops working afterwards.
+    """
+    def forward(self, input: List[SparseConvTensor]):
+        """Return the hash-based sum of all tensors in ``input``."""
+        summed = F.sparse_add_hash_based(*input)
+        return summed
+
+    def input_spatial_size(self, out_size):
+        """The spatial size is unchanged, so the output size is returned as is."""
+        return out_size

-class ConcatTable(SparseModule):  # Module):
+class ConcatTable(SparseModule):
    def forward(self, input):
        return [module(input) for module in self._modules.values()]


--- a/spconv/pytorch/utils.py
+++ b/spconv/pytorch/utils.py
@@ -140,7 +140,6 @@ class PointToVoxel(object):
                num_voxels = res[0].shape[0]
            else:
                pc_tv = torch_tensor_to_tv(pc)
-                stream = get_current_stream()
                voxels_tv = torch_tensor_to_tv(self.voxels)
                indices_tv = torch_tensor_to_tv(self.indices)
                num_per_voxel_tv = torch_tensor_to_tv(self.num_per_voxel)

--- a/version.txt
+++ b/version.txt
-2.1.13
+2.1.14
\ No newline at end of file