Merge branch 'master' into develop

d03b947a · yan.yan · 9d1e33d6 · 8aa0f1f7 · d03b947a · d03b947a
Commit d03b947a authored Nov 29, 2021 by yan.yan
10 changed files
--- a/spconv/pytorch/conv.py
+++ b/spconv/pytorch/conv.py
@@ -14,6 +14,7 @@
 import math
 import time
+import sys
 from typing import List, Optional, Tuple, Union
 import numpy as np
@@ -25,10 +26,11 @@ from torch.nn.parameter import Parameter
 from spconv import pytorch as spconv
 from spconv import SPCONV_VERSION_NUMBERS
 from spconv.core import ConvAlgo
+from spconv.debug_utils import spconv_save_debug_data
 from spconv.pytorch import functional as Fsp
 from spconv.pytorch import ops
 from spconv.cppconstants import CPU_ONLY_BUILD
-from spconv.pytorch.core import IndiceData, SparseConvTensor, ImplicitGemmIndiceData
+from spconv.pytorch.core import IndiceData, SparseConvTensor, ImplicitGemmIndiceData, expand_nd
 from spconv.pytorch.modules import SparseModule
 from spconv.constants import SAVED_WEIGHT_LAYOUT, ALL_WEIGHT_IS_KRSC
 from spconv.utils import nullcontext
@@ -109,32 +111,22 @@ class SparseConvolution(SparseModule):
                 name=None):
        super(SparseConvolution, self).__init__(name=name)
        assert groups == 1, "don't support groups for now"
-        if not isinstance(kernel_size, (list, tuple)):
-            kernel_size = [kernel_size] * ndim
-        if not isinstance(stride, (list, tuple)):
-            stride = [stride] * ndim
-        if not isinstance(padding, (list, tuple)):
-            padding = [padding] * ndim
-        if not isinstance(dilation, (list, tuple)):
-            dilation = [dilation] * ndim
-        if not isinstance(output_padding, (list, tuple)):
-            output_padding = [output_padding] * ndim
        self.ndim = ndim
        self.in_channels = in_channels
        self.out_channels = out_channels
-        self.kernel_size = kernel_size
+        self.kernel_size = expand_nd(ndim, kernel_size)
        kv = int(np.prod(kernel_size))
        kv_stride = int(np.prod(stride))
        self.conv1x1 = kv == 1
        # TODO we should deprecate support for ksize == 1 but stride != 1.
        if not subm:
            self.conv1x1 &= kv_stride == 1
-        self.stride = stride
+        self.stride = expand_nd(ndim, stride)
-        self.padding = padding
+        self.padding = expand_nd(ndim, padding)
-        self.dilation = dilation
+        self.dilation = expand_nd(ndim, dilation)
        self.transposed = transposed
        self.inverse = inverse
-        self.output_padding = output_padding
+        self.output_padding = expand_nd(ndim, output_padding)
        self.groups = groups
        self.subm = subm
        self.indice_key = indice_key
@@ -156,15 +148,15 @@ class SparseConvolution(SparseModule):
            if FILTER_HWIO:
                # RSCK
                self.weight = Parameter(
-                    torch.Tensor(*kernel_size, in_channels, out_channels))
+                    torch.Tensor(*self.kernel_size, in_channels, out_channels))
            else:
                # RSKC
                self.weight = Parameter(
-                    torch.Tensor(*kernel_size, out_channels, in_channels))
+                    torch.Tensor(*self.kernel_size, out_channels, in_channels))
        else:
            # KRSC
            self.weight = Parameter(
-                torch.Tensor(out_channels, *kernel_size, in_channels))
+                torch.Tensor(out_channels, *self.kernel_size, in_channels))
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
@@ -338,11 +330,21 @@ class SparseConvolution(SparseModule):
                        if input.benchmark:
                            torch.cuda.synchronize()
                            t = time.time()
+                        try:
                            outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
                                indices, batch_size, spatial_shape, algo,
                                self.kernel_size, self.stride, self.padding,
                                self.dilation, self.output_padding, self.subm,
                                self.transposed)
+                        except Exception as e:
+                            msg = "[Exception|native_pair]"
+                            msg += f"indices={indices.shape},bs={batch_size},ss={spatial_shape},"
+                            msg += f"algo={algo},ksize={self.kernel_size},stride={self.stride},"
+                            msg += f"padding={self.padding},dilation={self.dilation},subm={self.subm},"
+                            msg += f"transpose={self.transposed}"
+                            print(msg, file=sys.stderr)
+                            spconv_save_debug_data(indices)
+                            raise e 
                        if input.benchmark:
                            torch.cuda.synchronize()
                            interval = time.time() - t
@@ -356,7 +358,11 @@ class SparseConvolution(SparseModule):
                                                 spatial_shape,
                                                 out_spatial_shape,
                                                 is_subm=self.subm,
-                                                 algo=algo)
+                                                 algo=algo,
+                                                 ksize=self.kernel_size,
+                                                 stride=self.stride,
+                                                 padding=self.padding,
+                                                 dilation=self.dilation)
                        if self.indice_key is not None:
                            msg = f"your indice key {self.indice_key} already exists in this sparse tensor."
                            assert self.indice_key not in indice_dict, msg
@@ -399,10 +405,7 @@ class SparseConvolution(SparseModule):
                    mask_argsort_bwd_splits = datas.mask_argsort_fwd_splits
                    masks = datas.masks
                    out_spatial_shape = datas.spatial_shape
-                    assert datas.pair_fwd.shape[0] == np.prod(
+                    assert datas.ksize == self.kernel_size, "inverse conv must have same kernel size as its couple conv"
-                        self.kernel_size
-                    ), "inverse conv must have same kernel size as its couple conv"
                else:
                    if self.indice_key is not None and datas is not None:
                        outids = datas.out_indices
@@ -413,10 +416,25 @@ class SparseConvolution(SparseModule):
                        mask_argsort_fwd_splits = datas.mask_argsort_fwd_splits
                        mask_argsort_bwd_splits = datas.mask_argsort_bwd_splits
                        masks = datas.masks
+                        assert datas.is_subm, "only support reuse subm indices"
+                        if self.kernel_size != datas.ksize:
+                            raise ValueError(f"subm with same indice_key must have same kernel"
+                                f" size, expect {datas.ksize}, this layer {self.kernel_size}")
+                        if self.dilation != datas.dilation:
+                            raise ValueError(f"subm with same indice_key must have same dilation"
+                                f", expect {datas.dilation}, this layer {self.dilation}")
+                        if input.spatial_shape != datas.spatial_shape:
+                            raise ValueError(f"subm with same indice_key must have same spatial structure"
+                                f", expect {datas.spatial_shape}, input {spatial_shape}")
+                        if input.indices.shape[0] != datas.indices.shape[0]:
+                            raise ValueError(f"subm with same indice_key must have same num of indices"
+                                f", expect {datas.indices.shape[0]}, input {input.indices.shape[0]}")
                    else:
                        with input._timer.namespace("gen_pairs"):
                            # we need to gen bwd indices for regular conv
                            # because it may be inversed.
+                            try:
                                res = ops.get_indice_pairs_implicit_gemm(
                                    indices,
                                    batch_size,
@@ -432,6 +450,16 @@ class SparseConvolution(SparseModule):
                                    is_train=(not self.subm) or self.training,
                                    alloc=input.thrust_allocator,
                                    timer=input._timer)
+                            except Exception as e:
+                                msg = "[Exception|implicit_gemm_pair]"
+                                msg += f"indices={indices.shape},bs={batch_size},ss={spatial_shape},"
+                                msg += f"algo={algo},ksize={self.kernel_size},stride={self.stride},"
+                                msg += f"padding={self.padding},dilation={self.dilation},subm={self.subm},"
+                                msg += f"transpose={self.transposed}"
+                                print(msg, file=sys.stderr)
+                                spconv_save_debug_data(indices)
+                                raise e 
                        outids = res[0]
                        num_inds_per_loc = res[1]
                        pair_fwd = res[2]
@@ -455,7 +483,11 @@ class SparseConvolution(SparseModule):
                                is_subm=self.subm,
                                spatial_shape=spatial_shape,
                                out_spatial_shape=out_spatial_shape,
-                                algo=algo)
+                                algo=algo,
+                                ksize=self.kernel_size,
+                                stride=self.stride,
+                                padding=self.padding,
+                                dilation=self.dilation)
                            msg = f"your indice key {self.indice_key} already exists in this sparse tensor."
                            assert self.indice_key not in indice_dict, msg
                            indice_dict[self.indice_key] = indice_data

--- a/spconv/pytorch/core.py
+++ b/spconv/pytorch/core.py
@@ -12,13 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union
 import numpy as np
 import torch
 from spconv.core import ConvAlgo
 from spconv.pytorch.constants import PYTORCH_VERSION
-from spconv.pytorch.ops import ThrustSortAllocator
 from spconv.tools import CUDAKernelTimer
 if PYTORCH_VERSION >= [1, 8, 0]:
@@ -39,9 +38,28 @@ else:
        pass
+class ThrustSortAllocator:
+    """Cache of uint8 scratch tensors that are handed out as raw data pointers.
+
+    Each tensor is stored under the size it was created for and is never
+    removed by this class, so it stays referenced for as long as the
+    allocator object exists.
+    """
+
+    def __init__(self, device: torch.device) -> None:
+        super().__init__()
+        self.device = device
+        # requested size -> tensor that was allocated for that request
+        self.alloced_objs = {}
+
+    def alloc(self, n: int):
+        """Return ``data_ptr()`` of a cached buffer that can hold ``n`` bytes.
+
+        Lookup order: a buffer stored under exactly ``n``; otherwise the
+        first stored buffer (in insertion order) whose size is larger than
+        ``n``; otherwise a new ``n``-element uint8 tensor is created on
+        ``self.device`` and cached under ``n``.
+        """
+        buffers = self.alloced_objs
+        chosen = buffers.get(n)
+        if chosen is None:
+            chosen = next((buf for size, buf in buffers.items() if size > n), None)
+        if chosen is None:
+            chosen = torch.empty([n], dtype=torch.uint8, device=self.device)
+            buffers[n] = chosen
+        return chosen.data_ptr()
 class IndiceData(object):
    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
-                 spatial_shape, out_spatial_shape, is_subm: bool, algo: ConvAlgo):
+                 spatial_shape, out_spatial_shape, is_subm: bool, algo: ConvAlgo,
+                 ksize: List[int], stride: List[int], dilation: List[int], padding: List[int]):
        self.out_indices = out_indices
        self.indices = indices
        self.indice_pairs = indice_pairs
@@ -50,6 +68,10 @@ class IndiceData(object):
        self.out_spatial_shape = out_spatial_shape
        self.is_subm = is_subm
        self.algo = algo
+        self.ksize = ksize
+        self.stride = stride
+        self.dilation = dilation
+        self.padding = padding
 class ImplicitGemmIndiceData(object):
@@ -60,7 +82,8 @@ class ImplicitGemmIndiceData(object):
                 mask_argsort_fwd_splits: List[torch.Tensor],
                 mask_argsort_bwd_splits: List[torch.Tensor],
                 masks: List[np.ndarray], spatial_shape, 
-                 out_spatial_shape, is_subm: bool, algo: ConvAlgo):
+                 out_spatial_shape, is_subm: bool, algo: ConvAlgo,
+                 ksize: List[int], stride: List[int], dilation: List[int], padding: List[int]):
        self.out_indices = out_indices
        self.indices = indices
        self.pair_fwd = pair_fwd
@@ -74,6 +97,10 @@ class ImplicitGemmIndiceData(object):
        self.out_spatial_shape = out_spatial_shape
        self.is_subm = is_subm
        self.algo = algo
+        self.ksize = ksize
+        self.stride = stride
+        self.dilation = dilation
+        self.padding = padding
 def scatter_nd(indices, updates, shape):
@@ -225,3 +252,13 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
        tensor._timer = self._timer
        tensor.force_algo = self.force_algo
        return tensor
+def expand_nd(ndim: int, val: Union[int, List[int], Tuple[int, ...]]) -> List[int]:
+    """Broadcast a per-dimension setting to a list of exactly ``ndim`` entries.
+
+    Args:
+        ndim: number of entries the result must have.
+        val: a single int, which is repeated ``ndim`` times, or a list/tuple
+            that already holds one entry per dimension.
+
+    Returns:
+        A new list of length ``ndim``. It is never the caller's own object,
+        so mutating the argument afterwards cannot change the result.
+
+    Raises:
+        AssertionError: if a sequence is given whose length is not ``ndim``.
+    """
+    if isinstance(val, int):
+        res = [val] * ndim
+    else:
+        # always copy, lists included, so the value stored by the caller of
+        # this helper does not alias a list owned by user code.
+        res = list(val)
+    assert len(res) == ndim, f"expected {ndim} values, got {len(res)}: {res}"
+    return res
--- a/spconv/pytorch/cppcore.py
+++ b/spconv/pytorch/cppcore.py
@@ -56,6 +56,9 @@ def torch_tensor_to_tv(ten: torch.Tensor,
            return tv.from_blob(ptr, shape, dtype, tv_device)
    return tv.from_blob_strided(ptr, shape, stride, dtype, tv_device)
+def torch_tensors_to_tv(*tens: torch.Tensor):
+    """Lazily convert every given tensor with ``torch_tensor_to_tv``.
+
+    Returns a generator that yields the converted tensors in argument
+    order; no conversion happens until the generator is iterated (for
+    example by unpacking it).
+    """
+    for ten in tens:
+        yield torch_tensor_to_tv(ten)
 def get_current_stream():
    return torch.cuda.current_stream().cuda_stream

--- a/spconv/pytorch/functional.py
+++ b/spconv/pytorch/functional.py
@@ -12,19 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
+import pickle 
 import torch
 from torch import nn
 from torch.autograd import Function
 from typing import Optional, TypeVar
 from spconv.pytorch.core import SparseConvTensor
 from spconv.tools import CUDAKernelTimer
-from spconv.pytorch import ops
+from spconv.pytorch import ops, SparseConvTensor
 from spconv.pytorch.constants import PYTORCH_VERSION
+from spconv.debug_utils import spconv_save_debug_data
 from torch.autograd.function import once_differentiable
 import numpy as np
+from pathlib import Path
+from spconv.pytorch.hash import HashTable
+from cumm.gemm.layout import to_stride
 from typing import List
+from functools import reduce 
+_MAX_INT32 = 2147483647
 _T = TypeVar("_T")
@@ -54,6 +62,7 @@ class SparseConvFunction(Function):
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        ctx.algo = algo
        ctx.timer = timer
+        try:
            return ops.indice_conv(features,
                                filters,
                                indice_pairs,
@@ -62,6 +71,13 @@ class SparseConvFunction(Function):
                                False,
                                algo=algo,
                                timer=timer)
+        except Exception as e:
+            msg = "[Exception|indice_conv]"
+            msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
+            msg += f"pairnum={indice_pair_num},act={num_activate_out},algo={algo}"
+            print(msg, file=sys.stderr)
+            spconv_save_debug_data((indice_pairs, indice_pair_num))
+            raise e 
    @staticmethod
    @once_differentiable
@@ -69,7 +85,7 @@ class SparseConvFunction(Function):
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
        timer = ctx.timer
+        try:
            input_bp, filters_bp = ops.indice_conv_backward(features,
                                                            filters,
                                                            grad_output,
@@ -78,6 +94,13 @@ class SparseConvFunction(Function):
                                                            False,
                                                            algo=ctx.algo,
                                                            timer=timer)
+        except Exception as e:
+            msg = "[Exception|indice_conv_backward]"
+            msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
+            msg += f"pairnum={indice_pair_num},do={grad_output.shape}"
+            print(msg, file=sys.stderr)
+            spconv_save_debug_data((indice_pairs, indice_pair_num))
+            raise e 
        return input_bp, filters_bp, None, None, None, None, None
@@ -96,7 +119,7 @@ class SparseInverseConvFunction(Function):
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        ctx.algo = algo
        ctx.timer = timer
+        try:
            return ops.indice_conv(features,
                                filters,
                                indice_pairs,
@@ -106,6 +129,13 @@ class SparseInverseConvFunction(Function):
                                False,
                                algo=algo,
                                timer=timer)
+        except Exception as e:
+            msg = "[Exception|indice_conv|inverse]"
+            msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
+            msg += f"pairnum={indice_pair_num},act={num_activate_out},algo={algo}"
+            print(msg, file=sys.stderr)
+            spconv_save_debug_data((indice_pairs, indice_pair_num))
+            raise e 
    @staticmethod
    @once_differentiable
@@ -113,7 +143,7 @@ class SparseInverseConvFunction(Function):
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
        timer = ctx.timer
+        try:
            input_bp, filters_bp = ops.indice_conv_backward(features,
                                                            filters,
                                                            grad_output,
@@ -123,6 +153,13 @@ class SparseInverseConvFunction(Function):
                                                            False,
                                                            algo=ctx.algo,
                                                            timer=timer)
+        except Exception as e:
+            msg = "[Exception|indice_conv_backward|inverse]"
+            msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
+            msg += f"pairnum={indice_pair_num},do={grad_output.shape}"
+            print(msg, file=sys.stderr)
+            spconv_save_debug_data((indice_pairs, indice_pair_num))
+            raise e 
        return input_bp, filters_bp, None, None, None, None, None
@@ -144,13 +181,23 @@ class SparseImplicitGemmFunction(Function):
                is_train: bool,
                is_subm: bool,
                timer: CUDAKernelTimer = CUDAKernelTimer(False)):
+        try:
            out, mask_out, mask_width = ops.implicit_gemm(features, filters,
                                                        pair_fwd,
                                                        pair_mask_fwd_splits,
                                                        mask_argsort_fwd_splits,
                                                        num_activate_out, masks,
                                                        is_train, is_subm, timer)
+        except Exception as e:
+            msg = "[Exception|implicit_gemm]"
+            msg += f"feat={features.shape},w={filters.shape},pair={pair_fwd.shape},"
+            msg += f"act={num_activate_out},issubm={is_subm},istrain={is_train}"
+            print(msg, file=sys.stderr)
+            spconv_save_debug_data((pair_fwd, pair_bwd, pair_mask_fwd_splits, 
+                pair_mask_bwd_splits, mask_argsort_fwd_splits, mask_argsort_bwd_splits,
+                masks))
+            raise e 
        ctx.save_for_backward(features, filters, pair_fwd, pair_bwd)
        ctx.mask_width = mask_width
        ctx.mask_out = mask_out
@@ -179,6 +226,7 @@ class SparseImplicitGemmFunction(Function):
        masks = ctx.masks
        is_subm = ctx.is_subm
        timer = ctx.timer
+        try:
            input_bp, filters_bp = ops.implicit_gemm_backward(
                features,
                filters,
@@ -194,6 +242,16 @@ class SparseImplicitGemmFunction(Function):
                mask_width=mask_width,
                is_subm=is_subm,
                timer=timer)
+        except Exception as e:
+            msg = "[Exception|implicit_gemm_backward]"
+            msg += f"feat={features.shape},w={filters.shape},pair={pair_fwd.shape},"
+            msg += f"issubm={is_subm},do={grad_output.shape}"
+            print(msg, file=sys.stderr)
+            spconv_save_debug_data((pair_fwd, pair_bwd, pair_mask_fwd_splits, 
+                pair_mask_bwd_splits, mask_argsort_fwd_splits, mask_argsort_bwd_splits,
+                masks))
+            raise e 
        None_9 = [None] * 11
        return (input_bp, filters_bp, *None_9)
@@ -212,6 +270,7 @@ class SubMConvFunction(Function):
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        ctx.algo = algo
        ctx.timer = timer
+        try:
            return ops.indice_conv(features,
                                filters,
                                indice_pairs,
@@ -221,6 +280,13 @@ class SubMConvFunction(Function):
                                True,
                                algo=algo,
                                timer=timer)
+        except Exception as e:
+            msg = "[Exception|indice_conv|subm]"
+            msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
+            msg += f"pairnum={indice_pair_num},act={num_activate_out},algo={algo}"
+            print(msg, file=sys.stderr)
+            spconv_save_debug_data((indice_pairs, indice_pair_num))
+            raise e 
    @staticmethod
    @once_differentiable
@@ -228,7 +294,7 @@ class SubMConvFunction(Function):
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
        timer = ctx.timer
+        try:
            input_bp, filters_bp = ops.indice_conv_backward(features,
                                                            filters,
                                                            grad_output,
@@ -238,6 +304,14 @@ class SubMConvFunction(Function):
                                                            True,
                                                            algo=ctx.algo,
                                                            timer=timer)
+        except Exception as e:
+            msg = "[Exception|indice_conv_backward|subm]"
+            msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
+            msg += f"pairnum={indice_pair_num},do={grad_output.shape}"
+            print(msg, file=sys.stderr)
+            spconv_save_debug_data((indice_pairs, indice_pair_num))
+            raise e 
        return input_bp, filters_bp, None, None, None, None, None
@@ -290,16 +364,98 @@ indice_maxpool = SparseMaxPoolFunction.apply
 indice_maxpool_implicit_gemm = SparseMaxPoolImplicitGemmFunction.apply
-def sparse_add(a: SparseConvTensor, b: SparseConvTensor):
+def _indice_to_scalar(indices: torch.Tensor, shape: List[int]):
+    """Collapse each row of ``indices`` into one scalar index.
+
+    ``indices`` must have one column per entry of ``shape``. The result is
+    the last column plus, for every other column ``i``,
+    ``stride[i] * indices[:, i]`` with ``stride = to_stride(shape)``
+    (presumably row-major strides -- confirm against cumm's ``to_stride``).
+    The returned tensor is a new contiguous tensor; ``indices`` is not
+    modified.
+    """
-    a_th = torch.sparse_coo_tensor(a.indices.T, a.features)
+    assert indices.shape[1] == len(shape)
-    b_th = torch.sparse_coo_tensor(b.indices.T, b.features)
+    stride = to_stride(np.array(shape, dtype=np.int64))
-    a_shape = a.spatial_shape
+    # clone so the in-place accumulation below does not write into ``indices``
+    scalar_inds = indices[:, -1].clone()
-    b_shape = b.spatial_shape
+    for i in range(len(shape) - 1):
+        scalar_inds += stride[i] * indices[:, i]
-    res_shape = []
+    return scalar_inds.contiguous()
-    for sa, sb in zip(a_shape, b_shape):
-        res_shape.append(max(sa, sb))
+def sparse_add_hash_based(*tens: SparseConvTensor):
-    c_th = a_th + b_th 
+    """Add sparse tensors whose indices may be misaligned, using a hash table.
-    c_th_inds = c_th.indices().T.contiguous()
+
-    assert c_th.is_contiguous()
+    All operands must share spatial shape, batch size and channel count.
-    return SparseConvTensor(c_th.values(), c_th_inds, res_shape, max(a.batch_size, b.batch_size))
+    The result normally carries no ``indice_dict``, so an inverse conv can
+    no longer be applied to it. The only exception: when the number of output
+    rows equals the row count of the largest operand, that operand's
+    ``indice_dict`` is reused for the result.
+    """
+    # validate operands, sum their row counts (used to size the hash table)
+    # and remember which operand has the most rows.
+    table_size = 0
+    max_num_indices = 0
+    max_num_indices_idx = 0
+    for i, ten in enumerate(tens):
+        assert ten.spatial_shape == tens[0].spatial_shape
+        assert ten.batch_size == tens[0].batch_size
+        assert ten.features.shape[1] == tens[0].features.shape[1]
+        table_size += ten.features.shape[0]
+        if max_num_indices < ten.features.shape[0]:
+            max_num_indices_idx = i
+            max_num_indices = ten.features.shape[0]
+    first = tens[0]
+    feat = first.features
+    shape = [first.batch_size, *first.spatial_shape]
+    whole_shape = int(np.prod(shape))
+    # twice the total row count, the size HashTable's docstring recommends
+    # for its fixed-size cuda table.
+    table_size *= 2
+    # use 64-bit keys once the flattened (batch, *spatial) extent reaches
+    # the int32 maximum.
+    k_type = torch.int32
+    if whole_shape >= _MAX_INT32:
+        k_type = torch.int64
+    table = HashTable(first.features.device, k_type, torch.int32, table_size)
+    scalars: List[torch.Tensor] = []
+    for ten in tens:
+        indices = ten.indices
+        if whole_shape >= _MAX_INT32:
+            indices = indices.long()
+        scalar = _indice_to_scalar(indices, shape)
+        scalars.append(scalar)
+        table.insert(scalar)
+    # assign arange to values of hash table
+    # (count is presumably the number of distinct keys -- confirm in HashTable)
+    count = table.assign_arange_()
+    count_val = count.item()
+    out_features = torch.zeros([int(count_val), feat.shape[1]], dtype=feat.dtype, device=feat.device)
+    out_indices = torch.zeros([int(count_val), first.indices.shape[1]], dtype=first.indices.dtype, device=first.indices.device)
+    # look up each operand's output rows, accumulate its features there and
+    # record the corresponding indices.
+    for ten, scalar in zip(tens, scalars):
+        out_inds, _ = table.query(scalar)
+        out_inds = out_inds.long()
+        out_features[out_inds] += ten.features
+        out_indices[out_inds] = ten.indices
+    res = SparseConvTensor(out_features, out_indices, first.spatial_shape, first.batch_size, 
+        benchmark=first.benchmark)
+    # same row count as the largest operand: reuse that operand's indice_dict
+    if count_val == max_num_indices:
+        res.indice_dict = tens[max_num_indices_idx].indice_dict
+    res.benchmark_record = first.benchmark_record
+    res._timer = first._timer 
+    res.thrust_allocator = first.thrust_allocator
+    return res 
+def sparse_add(*tens: SparseConvTensor):
+    """Add sparse tensors by going through ``torch.sparse`` COO tensors.
+
+    Every operand must share the spatial shape, batch size and channel
+    count of the first one. The operands are converted to sparse COO
+    tensors, summed left to right and coalesced; the coalesced indices and
+    values are wrapped in a new ``SparseConvTensor``. The ``indice_dict`` of
+    the operand with the most rows is carried over only when the result has
+    exactly that many rows.
+    """
+    first = tens[0]
+    res_shape = [first.batch_size, *first.spatial_shape, first.features.shape[1]]
+    largest_rows = 0
+    largest_idx = 0
+    coo_operands: List[torch.Tensor] = []
+    for idx, ten in enumerate(tens):
+        assert ten.spatial_shape == tens[0].spatial_shape
+        assert ten.batch_size == tens[0].batch_size
+        assert ten.features.shape[1] == tens[0].features.shape[1]
+        num_rows = ten.features.shape[0]
+        if num_rows > largest_rows:
+            largest_idx, largest_rows = idx, num_rows
+        coo_operands.append(
+            torch.sparse_coo_tensor(ten.indices.T, ten.features, res_shape, requires_grad=True))
+    # left-to-right sum of all operands
+    total = coo_operands[0]
+    for operand in coo_operands[1:]:
+        total = total + operand
+    total = total.coalesce()
+    out_indices = total.indices().T.contiguous().int()
+    out_features = total.values()
+    assert out_features.is_contiguous()
+    res = SparseConvTensor(out_features, out_indices, first.spatial_shape, first.batch_size,
+                           benchmark=first.benchmark)
+    if out_features.shape[0] == largest_rows:
+        res.indice_dict = tens[largest_idx].indice_dict
+    res.benchmark_record = first.benchmark_record
+    res._timer = first._timer
+    res.thrust_allocator = first.thrust_allocator
+    return res
--- a/spconv/pytorch/hash.py
+++ b/spconv/pytorch/hash.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+import torch 
+from cumm import tensorview as tv 
+from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
+from spconv.core_cc.csrc.hash.core import HashTable as _HashTable
+_TORCH_DTYPE_TO_ITEMSIZE = {
+    torch.int32: 4,
+    torch.int64: 8,
+    torch.float32: 4,
+    torch.float64: 8,
+}
+class HashTable:
+    """simple hash table for 32 and 64 bit data. support both cpu and cuda.
+    for cuda, it's a fixed-size table, you must provide maximum size 
+    (recommend 2 * num).
+    see spconv/pytorch/functional/sparse_add_hash_based, a real example
+    that show how to use hash table to implement 
+    sparse add (same shape, different indices)
+    """
+    def __init__(self, device: torch.device, key_dtype: torch.dtype,
+                 value_dtype: torch.dtype,
+                 max_size: int = -1) -> None:
+        """Create the native hash table for ``device``.
+
+        Args:
+            device: where the table lives; ``device.type == "cpu"`` selects
+                the cpu table, any other type the cuda one.
+            key_dtype: torch dtype of the keys; must be a key of
+                ``_TORCH_DTYPE_TO_ITEMSIZE``.
+            value_dtype: torch dtype of the values; same restriction.
+            max_size: number of slots allocated for the cuda table. Must be
+                positive there; not used for the cpu table.
+        """
+        on_cpu = device.type == "cpu"
+        self.is_cpu = on_cpu
+        self.key_dtype = key_dtype
+        self.value_dtype = value_dtype
+        # empty tensorview handles, replaced below only when storage is allocated
+        key_data_tv, value_data_tv = tv.Tensor(), tv.Tensor()
+        if not on_cpu:
+            assert max_size > 0, "you must provide max_size for fixed-size cuda hash table, usually *2 of num of keys"
+            assert device is not None, "you must specify device for cuda hash table."
+            self.keys_data = torch.empty([max_size], dtype=key_dtype, device=device)
+            self.values_data = torch.empty([max_size], dtype=value_dtype, device=device)
+            key_data_tv = torch_tensor_to_tv(self.keys_data)
+            value_data_tv = torch_tensor_to_tv(self.values_data)
+        else:
+            self.keys_data = None
+            self.values_data = None
+        stream = 0 if on_cpu else get_current_stream()
+        self.key_itemsize = _TORCH_DTYPE_TO_ITEMSIZE[self.key_dtype]
+        self.value_itemsize = _TORCH_DTYPE_TO_ITEMSIZE[self.value_dtype]
+        self._valid_value_dtype_for_arange = {torch.int32, torch.int64}
+        self._table = _HashTable(on_cpu, self.key_itemsize, self.value_itemsize,
+                                 key_data_tv, value_data_tv, stream)
+    def insert(self, keys: torch.Tensor, values: Optional[torch.Tensor] = None):
+        """Insert ``keys``, and ``values`` when given, into the table.
+
+        When ``values`` is None only the keys are inserted and the value
+        stored for them is undefined. Returns whatever the native
+        ``insert`` returns.
+        """
+        keys_tv = torch_tensor_to_tv(keys)
+        # an empty tensorview stands in for "no values supplied"
+        values_tv = tv.Tensor() if values is None else torch_tensor_to_tv(values)
+        stream = 0 if self.is_cpu else get_current_stream()
+        return self._table.insert(keys_tv, values_tv, stream)
+    def query(self, keys: torch.Tensor, values: Optional[torch.Tensor] = None):
+        """Look up ``keys`` and return ``(values, is_empty)``.
+
+        If ``values`` is None, a new tensor of ``self.value_dtype`` with one
+        entry per key is allocated on ``keys.device`` to receive the
+        results; otherwise the given tensor is passed to the native query.
+        ``is_empty`` is a uint8 tensor with one entry per key that the
+        native query fills in (presumably marking keys that were not
+        found -- confirm against the native implementation).
+        """
+        num_keys = keys.shape[0]
+        keys_tv = torch_tensor_to_tv(keys)
+        if values is None:
+            values = torch.empty([num_keys], dtype=self.value_dtype, device=keys.device)
+        values_tv = torch_tensor_to_tv(values)
+        stream = 0 if self.is_cpu else get_current_stream()
+        is_empty = torch.empty([num_keys], dtype=torch.uint8, device=keys.device)
+        self._table.query(keys_tv, values_tv, torch_tensor_to_tv(is_empty), stream)
+        return values, is_empty
+    def insert_exist_keys(self, keys: torch.Tensor, values: torch.Tensor):
+        """Store values for keys that already exist in the table.
+
+        Args:
+            keys: tensor of keys to update.
+            values: tensor of values to store for those keys.
+
+        Returns:
+            A uint8 tensor with one entry per key, allocated on
+            ``keys.device`` and passed to the backend, which reports whether
+            each insert succeeded.
+        """
+        keys_tv = torch_tensor_to_tv(keys)
+        values_tv = torch_tensor_to_tv(values)
+        # a CPU table always uses stream 0; otherwise ask for the current stream.
+        stream = 0 if self.is_cpu else get_current_stream()
+        is_success = torch.empty([keys.shape[0]], dtype=torch.uint8, device=keys.device)
+        self._table.insert_exist_keys(keys_tv, values_tv, torch_tensor_to_tv(is_success), stream)
+        return is_success
+    def assign_arange_(self):
+        """Iterate the table and overwrite its values with "arange" values.
+
+        Equivalent to 1. fetching the keys with items(), 2. inserting those
+        keys again together with arange(key.shape[0]).
+
+        Returns:
+            For a CPU table, an int64 tensor holding ``self._table.size_cpu()``.
+            For a cuda table, a one-element zero-initialised counter tensor on
+            the device of ``self.values_data``; a view of it is passed to the
+            backend (presumably to receive the item count -- TODO confirm).
+
+        Raises:
+            NotImplementedError: cuda table whose key item size is not 4 or 8.
+        """
+        if self.is_cpu:
+            # only the CPU path restricts the value dtype.
+            assert self.value_dtype in self._valid_value_dtype_for_arange
+            stream = 0
+            # the CPU backend is given an empty counter tensor.
+            count_tv = tv.Tensor()
+            count = torch.tensor([self._table.size_cpu()], dtype=torch.int64)
+        else:
+            stream = get_current_stream()
+            assert self.values_data is not None
+            # the counter width follows the key item size.
+            if self.key_itemsize == 4:
+                torch_dtype, tv_dtype = torch.int32, tv.uint32
+            elif self.key_itemsize == 8:
+                torch_dtype, tv_dtype = torch.int64, tv.uint64
+            else:
+                raise NotImplementedError
+            count = torch.zeros([1], dtype=torch_dtype, device=self.values_data.device)
+            count_tv = torch_tensor_to_tv(count, dtype=tv_dtype)
+        self._table.assign_arange_(count_tv, stream)
+        return count
+    def items(self, max_size: int = -1):
+        """Dump the table content into freshly allocated key/value tensors.
+
+        Args:
+            max_size: capacity of the output tensors for a cuda table; -1
+                selects the length of ``self.values_data``. Ignored for a CPU
+                table, where ``self._table.size_cpu()`` is used instead.
+
+        Returns:
+            A ``(keys, values, count)`` tuple. For a CPU table ``count`` is an
+            int64 tensor holding ``self._table.size_cpu()``. For a cuda table
+            it is a one-element zero-initialised counter whose view is passed
+            to the backend (presumably to receive the number of valid
+            entries -- TODO confirm).
+
+        Raises:
+            NotImplementedError: cuda table whose key item size is not 4 or 8.
+        """
+        if self.is_cpu:
+            stream = 0
+            # the CPU backend is given an empty counter tensor.
+            count_tv = tv.Tensor()
+            max_size = self._table.size_cpu()
+            count = torch.tensor([max_size], dtype=torch.int64)
+            keys = torch.empty([max_size], dtype=self.key_dtype)
+            values = torch.empty([max_size], dtype=self.value_dtype)
+        else:
+            stream = get_current_stream()
+            assert self.values_data is not None
+            device = self.values_data.device
+            # the counter width follows the key item size.
+            if self.key_itemsize == 4:
+                torch_dtype, tv_dtype = torch.int32, tv.uint32
+            elif self.key_itemsize == 8:
+                torch_dtype, tv_dtype = torch.int64, tv.uint64
+            else:
+                raise NotImplementedError
+            count = torch.zeros([1], dtype=torch_dtype, device=device)
+            count_tv = torch_tensor_to_tv(count, dtype=tv_dtype)
+            if max_size == -1:
+                max_size = self.values_data.shape[0]
+            keys = torch.empty([max_size], dtype=self.key_dtype, device=device)
+            values = torch.empty([max_size], dtype=self.value_dtype, device=device)
+        self._table.items(torch_tensor_to_tv(keys), torch_tensor_to_tv(values), count_tv, stream)
+        return keys, values, count
+def main():
+    """Exercise HashTable on the CPU and then on cuda:0, printing the results.
+
+    For each device: insert eight key/value pairs, query them back in
+    reverse order, dump the table, call assign_arange_() and dump again.
+    """
+    max_size = 1000
+    k_dtype = torch.int32
+    v_dtype = torch.int64
+    # CPU first, then cuda; only the cuda table is given a max_size.
+    for dev_name, table_kwargs in (("cpu", {}), ("cuda:0", {"max_size": max_size})):
+        dev = torch.device(dev_name)
+        table = HashTable(dev, k_dtype, v_dtype, **table_kwargs)
+        keys = torch.tensor([5, 3, 7, 4, 6, 2, 10, 8], dtype=k_dtype, device=dev)
+        values = torch.tensor([1, 6, 4, 77, 23, 756, 12, 12], dtype=v_dtype, device=dev)
+        keys_query = torch.tensor([8, 10, 2, 6, 4, 7, 3, 5], dtype=k_dtype, device=dev)
+        table.insert(keys, values)
+        queried_values, _ = table.query(keys_query)
+        print(queried_values)
+        # dump once as inserted, then once more after assign_arange_().
+        for reassign in (False, True):
+            if reassign:
+                table.assign_arange_()
+            ks, vs, cnt = table.items()
+            num_items = cnt.item()
+            print(cnt, ks[:num_items], vs[:num_items])
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/spconv/pytorch/ops.py
+++ b/spconv/pytorch/ops.py
@@ -22,6 +22,7 @@ import numpy as np
 import spconv
 from spconv.core import AlgoHint, ConvAlgo
 from typing import List, Optional, Union
+from spconv.pytorch.core import ThrustSortAllocator
 from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
 from spconv.core_cc.csrc.sparse.all import SpconvOps
 import spconv.core_cc as _ext
@@ -43,24 +44,6 @@ from spconv.tools import CUDAKernelTimer
 DEBUG = False
-class ThrustSortAllocator:
-    def __init__(self, device: torch.device) -> None:
-        super().__init__()
-        self.alloced_objs = {}
-        self.device = device
-    def alloc(self, n: int):
-        if n in self.alloced_objs:
-            return self.alloced_objs[n].data_ptr()
-        for n_cur, ten in self.alloced_objs.items():
-            if n < n_cur:
-                return ten.data_ptr()
-        ten = torch.empty([n], dtype=torch.uint8, device=self.device)
-        self.alloced_objs[n] = ten
-        return ten.data_ptr()
 def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    ndim = len(input_size)
    output_size = []
@@ -1531,3 +1514,4 @@ def indice_maxpool_implicit_gemm_backward(features, out_features, out_bp,
                                             out_bp_tv, din_tv,
                                             indice_pairs_tv, stream)
    return din
--- a/spconv/pytorch/pool.py
+++ b/spconv/pytorch/pool.py
@@ -26,7 +26,7 @@ from spconv import pytorch as spconv
 from spconv.core import ConvAlgo
 from spconv.pytorch import functional as Fsp
 from spconv.pytorch import ops
-from spconv.pytorch.core import IndiceData, ImplicitGemmIndiceData
+from spconv.pytorch.core import IndiceData, ImplicitGemmIndiceData, expand_nd
 from spconv.pytorch.modules import SparseModule
 from spconv.cppconstants import CPU_ONLY_BUILD
 from spconv.utils import nullcontext
@@ -36,7 +36,7 @@ class SparseMaxPool(SparseModule):
    def __init__(self,
                 ndim,
                 kernel_size: Union[int, List[int], Tuple[int, ...]] = 3,
-                 stride: Union[int, List[int], Tuple[int, ...]] = 1,
+                 stride: Optional[Union[int, List[int], Tuple[int, ...]]] = 1,
                 padding: Union[int, List[int], Tuple[int, ...]] = 0,
                 dilation: Union[int, List[int], Tuple[int, ...]] = 1,
                 indice_key: Optional[str] = None,
@@ -44,22 +44,15 @@ class SparseMaxPool(SparseModule):
                 algo: Optional[ConvAlgo] = None,
                 name=None):
        super(SparseMaxPool, self).__init__(name=name)
-        if not isinstance(kernel_size, (list, tuple)):
-            kernel_size = [kernel_size] * ndim
-        if stride is None:
-            stride = kernel_size.copy()
-        if not isinstance(stride, (list, tuple)):
-            stride = [stride] * ndim
-        if not isinstance(padding, (list, tuple)):
-            padding = [padding] * ndim
-        if not isinstance(dilation, (list, tuple)):
-            dilation = [dilation] * ndim
        self.ndim = ndim
-        self.kernel_size = kernel_size
+        self.kernel_size = expand_nd(ndim, kernel_size)
-        self.stride = stride
+        if stride is None:
-        self.padding = padding
+            self.stride = self.kernel_size.copy()
+        else:
+            self.stride = expand_nd(ndim, stride)
+        self.padding = expand_nd(ndim, padding)
        self.subm = subm
-        self.dilation = dilation
+        self.dilation = expand_nd(ndim, dilation)
        self.indice_key = indice_key
        kv = int(np.prod(kernel_size))
        if algo is None:
@@ -155,7 +148,11 @@ class SparseMaxPool(SparseModule):
                                                 spatial_shape,
                                                 out_spatial_shape,
                                                 is_subm=False,
-                                                 algo=self.algo)
+                                                 algo=self.algo,
+                                                 ksize=self.kernel_size,
+                                                 stride=self.stride,
+                                                 padding=self.padding,
+                                                 dilation=self.dilation)
                        indice_dict[self.indice_key] = indice_data
                    else:
                        raise ValueError(
@@ -204,7 +201,11 @@ class SparseMaxPool(SparseModule):
                        is_subm=self.subm,
                        spatial_shape=spatial_shape,
                        out_spatial_shape=out_spatial_shape,
-                        algo=self.algo)
+                        algo=self.algo,
+                        ksize=self.kernel_size,
+                        stride=self.stride,
+                        padding=self.padding,
+                        dilation=self.dilation)
                    msg = f"your indice key {self.indice_key} already exists in this sparse tensor."
                    assert self.indice_key not in indice_dict, msg
                    indice_dict[self.indice_key] = indice_data

--- a/spconv/pytorch/tables.py
+++ b/spconv/pytorch/tables.py
@@ -19,37 +19,68 @@ from torch.autograd import Function
 from spconv.pytorch.modules import SparseModule
 from spconv.pytorch.core import SparseConvTensor
 from typing import List
+from spconv.pytorch import functional as F
-class JoinTable(SparseModule):  # Module):
+class JoinTable(SparseModule):
+    """Concatenate the features of a list of sparse tensors along dim 1.
+
+    The output reuses indices, spatial_shape, batch_size, grid, voxel_num and
+    indice_dict of ``input[0]``; benchmark_record, thrust_allocator and _timer
+    are copied from ``input[1]``.
+    """
    def forward(self, input: List[SparseConvTensor]):
+        # Every tensor must agree with input[0] on spatial shape, batch size,
+        # feature channel count and number of indices. Only the number of
+        # indices is compared, not the index values themselves.
+        # NOTE(review): torch.cat along dim 1 does not itself require equal
+        # channel counts -- confirm the features.shape[1] check is intended.
+        msg = "you can't use JoinTable in two sptensor with different indices."
+        for ten in input:
+            assert ten.spatial_shape == input[0].spatial_shape, msg
+            assert ten.batch_size == input[0].batch_size, msg
+            assert ten.features.shape[1] == input[0].features.shape[1], msg
+            assert ten.indices.shape[0] == input[0].indices.shape[0], msg
        output = SparseConvTensor(torch.cat([i.features for i in input], 1),
                                  input[0].indices, input[0].spatial_shape,
                                  input[0].batch_size, input[0].grid,
                                  input[0].voxel_num, input[0].indice_dict)
+        # NOTE(review): indexing input[1] requires at least two tensors.
        output.benchmark_record = input[1].benchmark_record
        output.thrust_allocator = input[1].thrust_allocator
+        output._timer = input[1]._timer
        return output
    def input_spatial_size(self, out_size):
+        # the spatial size is returned unchanged.
        return out_size
-class AddTable(SparseModule):  # Module):
+class AddTable(SparseModule): 
+    """Sum the features of a list of sparse tensors.
+
+    The output reuses indices, spatial_shape, batch_size, grid, voxel_num and
+    indice_dict of ``input[0]``; benchmark_record, thrust_allocator and _timer
+    are copied from ``input[1]``.
+    """
    def forward(self, input: List[SparseConvTensor]):
+        # Every tensor must agree with input[0] on spatial shape, batch size,
+        # feature channel count and number of indices. Only the number of
+        # indices is compared, not the index values themselves.
+        msg = "you can't use AddTable in two sptensor with different indices. use AddTableMisaligned instead."
+        for ten in input:
+            assert ten.spatial_shape == input[0].spatial_shape, msg
+            assert ten.batch_size == input[0].batch_size, msg
+            assert ten.features.shape[1] == input[0].features.shape[1], msg
+            assert ten.indices.shape[0] == input[0].indices.shape[0], msg
        output = SparseConvTensor(sum([i.features for i in input]),
                                  input[0].indices, input[0].spatial_shape,
                                  input[0].batch_size, input[0].grid,
                                  input[0].voxel_num, input[0].indice_dict)
+        # NOTE(review): indexing input[1] requires at least two tensors.
        output.benchmark_record = input[1].benchmark_record
        output.thrust_allocator = input[1].thrust_allocator
+        output._timer = input[1]._timer
        return output
    def input_spatial_size(self, out_size):
+        # the spatial size is returned unchanged.
        return out_size
+class AddTableMisaligned(SparseModule):
+    """Add sparse tensors that share a shape but have different indices.
+
+    Slower than AddTable.
+
+    WARNING: do not use this in segmentation networks such as U-Net. Adding
+    misaligned tensors clears the downsample indices, after which
+    SparseInverseConvXd stops working.
+    """
+    def forward(self, input: List[SparseConvTensor]):
+        """Pass every tensor in ``input`` to ``F.sparse_add_hash_based``."""
+        tensors = tuple(input)
+        return F.sparse_add_hash_based(*tensors)
+    def input_spatial_size(self, out_size):
+        """Return ``out_size`` unchanged."""
+        return out_size
-class ConcatTable(SparseModule):  # Module):
+class ConcatTable(SparseModule):
    def forward(self, input):
        return [module(input) for module in self._modules.values()]

--- a/spconv/pytorch/utils.py
+++ b/spconv/pytorch/utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List
+from typing import List, Union
 import torch
 from cumm import tensorview as tv
@@ -140,7 +140,6 @@ class PointToVoxel(object):
                num_voxels = res[0].shape[0]
            else:
                pc_tv = torch_tensor_to_tv(pc)
-                stream = get_current_stream()
                voxels_tv = torch_tensor_to_tv(self.voxels)
                indices_tv = torch_tensor_to_tv(self.indices)
                num_per_voxel_tv = torch_tensor_to_tv(self.num_per_voxel)
@@ -158,12 +157,16 @@ class PointToVoxel(object):
                    self.num_per_voxel[:num_voxels], pc_voxel_id)
-def gather_features_by_pc_voxel_id(seg_res_features: torch.Tensor, pc_voxel_id: torch.Tensor):
+def gather_features_by_pc_voxel_id(seg_res_features: torch.Tensor, pc_voxel_id: torch.Tensor, invalid_value: Union[int, float] = 0):
    """This function is used to gather segmentation result to match origin pc.
    """
    if seg_res_features.device != pc_voxel_id.device:
        pc_voxel_id = pc_voxel_id.to(seg_res_features.device)
-    res = torch.zeros((pc_voxel_id.shape[0], seg_res_features.shape[1]), dtype=seg_res_features.dtype, device=seg_res_features.device)
+    res_feature_shape = (pc_voxel_id.shape[0], *seg_res_features.shape[1:])
+    if invalid_value == 0:
+        res = torch.zeros(res_feature_shape, dtype=seg_res_features.dtype, device=seg_res_features.device)
+    else:
+        res = torch.full(res_feature_shape, invalid_value, dtype=seg_res_features.dtype, device=seg_res_features.device)
    pc_voxel_id_valid = pc_voxel_id != -1
    pc_voxel_id_valid_ids = torch.nonzero(pc_voxel_id_valid).view(-1)
    seg_res_features_valid = seg_res_features[pc_voxel_id[pc_voxel_id_valid_ids]]

--- a/spconv/utils/__init__.py
+++ b/spconv/utils/__init__.py
@@ -16,6 +16,7 @@ import numpy as np
 from cumm import tensorview as tv
 from contextlib import AbstractContextManager
 from spconv.cppconstants import CPU_ONLY_BUILD
+from spconv.core_cc.csrc.utils.boxops import BoxOps
 from spconv.core_cc.csrc.sparse.all.ops_cpu1d import Point2VoxelCPU as Point2VoxelCPU1d
 from spconv.core_cc.csrc.sparse.all.ops_cpu2d import Point2VoxelCPU as Point2VoxelCPU2d
@@ -47,3 +48,69 @@ class nullcontext(AbstractContextManager):
    def __exit__(self, *excinfo):
        pass
+def rbbox_iou(box_corners: np.ndarray, qbox_corners: np.ndarray,
+              standup_iou: np.ndarray, standup_thresh: float):
+    """Run ``BoxOps.rbbox_iou`` between two sets of box corners.
+
+    Args:
+        box_corners: corner array; its first dimension gives the row count
+            and its dtype the dtype of the result.
+        qbox_corners: corner array; its first dimension gives the column
+            count of the result.
+        standup_iou: array forwarded to the backend unchanged.
+        standup_thresh: threshold forwarded to the backend unchanged.
+
+    Returns:
+        The (N, K) array that was zero-initialised here and handed to the
+        backend as its output argument.
+
+    Raises:
+        NotImplementedError: spconv was built without boost.
+    """
+    boost_available = BoxOps.has_boost()
+    if not boost_available:
+        raise NotImplementedError(
+            "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
+        )
+    out_shape = (box_corners.shape[0], qbox_corners.shape[0])
+    overlap = np.zeros(out_shape, dtype=box_corners.dtype)
+    boxes_tv = tv.from_numpy(box_corners)
+    qboxes_tv = tv.from_numpy(qbox_corners)
+    standup_tv = tv.from_numpy(standup_iou)
+    overlap_tv = tv.from_numpy(overlap)
+    # the trailing False is the only difference from rbbox_intersection.
+    BoxOps.rbbox_iou(boxes_tv, qboxes_tv, standup_tv, overlap_tv,
+                     standup_thresh, False)
+    return overlap
+def rbbox_intersection(box_corners: np.ndarray, qbox_corners: np.ndarray,
+                       standup_iou: np.ndarray, standup_thresh: float):
+    """Run ``BoxOps.rbbox_iou`` with its last flag set to True.
+
+    Args:
+        box_corners: corner array; its first dimension gives the row count
+            and its dtype the dtype of the result.
+        qbox_corners: corner array; its first dimension gives the column
+            count of the result.
+        standup_iou: array forwarded to the backend unchanged.
+        standup_thresh: threshold forwarded to the backend unchanged.
+
+    Returns:
+        The (N, K) array that was zero-initialised here and handed to the
+        backend as its output argument.
+
+    Raises:
+        NotImplementedError: spconv was built without boost.
+    """
+    boost_available = BoxOps.has_boost()
+    if not boost_available:
+        raise NotImplementedError(
+            "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
+        )
+    out_shape = (box_corners.shape[0], qbox_corners.shape[0])
+    overlap = np.zeros(out_shape, dtype=box_corners.dtype)
+    boxes_tv = tv.from_numpy(box_corners)
+    qboxes_tv = tv.from_numpy(qbox_corners)
+    standup_tv = tv.from_numpy(standup_iou)
+    overlap_tv = tv.from_numpy(overlap)
+    # the trailing True presumably selects intersection instead of IoU -- TODO confirm.
+    BoxOps.rbbox_iou(boxes_tv, qboxes_tv, standup_tv, overlap_tv,
+                     standup_thresh, True)
+    return overlap
+def rbbox_iou_loss(box_corners: np.ndarray, qbox_corners: np.ndarray):
+    """Run ``BoxOps.rbbox_iou_aligned`` on two corner arrays.
+
+    Args:
+        box_corners: corner array; its first dimension gives the length and
+            its dtype the dtype of the result.
+        qbox_corners: corner array forwarded to the backend unchanged.
+
+    Returns:
+        The (N, ) array that was zero-initialised here and handed to the
+        backend as its output argument.
+
+    Raises:
+        NotImplementedError: spconv was built without boost.
+    """
+    boost_available = BoxOps.has_boost()
+    if not boost_available:
+        raise NotImplementedError(
+            "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
+        )
+    overlap = np.zeros((box_corners.shape[0], ), dtype=box_corners.dtype)
+    boxes_tv = tv.from_numpy(box_corners)
+    qboxes_tv = tv.from_numpy(qbox_corners)
+    overlap_tv = tv.from_numpy(overlap)
+    BoxOps.rbbox_iou_aligned(boxes_tv, qboxes_tv, overlap_tv, False)
+    return overlap
+def non_max_suppression_cpu(boxes: np.ndarray,
+                            order: np.ndarray,
+                            thresh: float,
+                            eps: float = 0.0):
+    """Forward the arguments to ``BoxOps.non_max_suppression_cpu``.
+
+    ``boxes`` and ``order`` are wrapped with ``tv.from_numpy``; ``thresh`` and
+    ``eps`` are passed through unchanged. Returns the backend result as is.
+    """
+    boxes_tv = tv.from_numpy(boxes)
+    order_tv = tv.from_numpy(order)
+    return BoxOps.non_max_suppression_cpu(boxes_tv, order_tv, thresh, eps)
+def rotate_non_max_suppression_cpu(boxes: np.ndarray, order: np.ndarray,
+                                   standup_iou: np.ndarray, thresh: float):
+    """Forward the arguments to ``BoxOps.rotate_non_max_suppression_cpu``.
+
+    ``boxes``, ``order`` and ``standup_iou`` are wrapped with
+    ``tv.from_numpy``; ``thresh`` is passed through unchanged. Returns the
+    backend result as is.
+
+    Raises:
+        NotImplementedError: spconv was built without boost.
+    """
+    boost_available = BoxOps.has_boost()
+    if not boost_available:
+        raise NotImplementedError(
+            "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
+        )
+    boxes_tv = tv.from_numpy(boxes)
+    order_tv = tv.from_numpy(order)
+    standup_tv = tv.from_numpy(standup_iou)
+    return BoxOps.rotate_non_max_suppression_cpu(boxes_tv, order_tv,
+                                                 standup_tv, thresh)