sync

7af751dc · yan.yan · 647927ce · 7af751dc · 7af751dc · 7af751dc
Commit 7af751dc authored Jul 12, 2022 by yan.yan
8 changed files
--- a/spconv/csrc/utils/boxops.py
+++ b/spconv/csrc/utils/boxops.py
@@ -23,7 +23,7 @@ class BoostGeometryLib(pccm.Class):
    def __init__(self):
        super().__init__()
        assert BOOST_ROOT is not None 
-        self.build_meta.add_includes(BOOST_ROOT)
+        self.build_meta.add_public_includes(BOOST_ROOT)
        self.add_include("boost/geometry.hpp")

 class BoxOps(pccm.Class):

--- a/spconv/pytorch/core.py
+++ b/spconv/pytorch/core.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union, Dict

 import numpy as np
 import torch

--- a/spconv/pytorch/cppcore.py
+++ b/spconv/pytorch/cppcore.py
@@ -14,9 +14,10 @@

 from cumm import tensorview as tv
 import torch
-from typing import Optional, List
+from typing import Dict, Optional, List, Union
 from spconv.cppconstants import COMPILED_CUDA_ARCHS
 import sys 
+from spconv.core_cc.csrc.sparse.alloc import ExternalAllocator

 _TORCH_DTYPE_TO_TV = {
    torch.float32: tv.float32,
@@ -28,7 +29,10 @@ _TORCH_DTYPE_TO_TV = {
    torch.int16: tv.int16,
    torch.uint8: tv.uint8,
 }
+_TV_DTYPE_TO_TORCH = {v: k for k, v in _TORCH_DTYPE_TO_TV.items()}

+_TORCH_UINT_WORKAROUNDS = {tv.uint32: tv.int32, tv.uint16: tv.int16, tv.uint64: tv.int64}
+_ALL_INTS = {tv.int32, tv.int16, tv.int8, tv.int64, tv.uint64, tv.uint8, tv.uint32, tv.uint16}

 def torch_tensor_to_tv(ten: torch.Tensor,
                       dtype: Optional[int] = None,
@@ -46,7 +50,8 @@ def torch_tensor_to_tv(ten: torch.Tensor,
        shape = list(ten.shape)
    if dtype is None:
        dtype = _TORCH_DTYPE_TO_TV[ten.dtype]
-    return tv.from_blob(ptr, shape, dtype, tv_device)
+    stride = ten.stride()
+    return tv.from_blob_strided(ptr, shape, list(stride), dtype, tv_device)

 def torch_tensors_to_tv(*tens: torch.Tensor):
    return (torch_tensor_to_tv(t) for t in tens)
@@ -63,6 +68,118 @@ def get_arch():
                f"available: {COMPILED_CUDA_ARCHS}", file=sys.stderr)
    return arch

+class TorchAllocator(ExternalAllocator):
+    """ExternalAllocator implementation that backs every request with a torch tensor.
+
+    Each tensor created here is stored in ``allocated`` under its data
+    pointer, and also under ``name`` when a non-empty name is given. free()
+    and free_noexcept() remove the pointer entry only; name entries stay.
+    All four allocation methods share _allocate, so the uint workaround and
+    the cache bookkeeping are written once.
+    """
+    def __init__(self, gpudevice: torch.device) -> None:
+        super().__init__()
+        self.gpudevice = gpudevice
+        self.cpudevice = torch.device("cpu:0")
+        # data pointer (int) or caller-supplied name (str) -> torch tensor
+        self.allocated: Dict[Union[str, int], torch.Tensor] = {}
+
+    def _allocate(self, name: str, dtype: int, device: int, factory) -> tv.Tensor:
+        """Shared body of zeros / empty / full_int / full_float.
+
+        Args:
+            name: extra cache key for the tensor; "" registers no name.
+            dtype: requested tv dtype.
+            device: -1 selects the cpu device, anything else the gpu device.
+            factory: called as factory(dtype=..., device=...); must return
+                the new torch tensor.
+
+        Returns:
+            tv.Tensor over the new torch tensor, typed as the requested dtype.
+        """
+        requested_dtype = dtype
+        needs_uint_view = dtype in _TORCH_UINT_WORKAROUNDS
+        if needs_uint_view:
+            # allocate as the same-width signed dtype and re-view at the end;
+            # this path is only allowed for unnamed (temp) memory.
+            assert name == "", "must be temp memory for uint dtypes"
+            dtype = _TORCH_UINT_WORKAROUNDS[dtype]
+        th_dtype = _TV_DTYPE_TO_TORCH[dtype]
+        dev = self.cpudevice if device == -1 else self.gpudevice
+        ten = factory(dtype=th_dtype, device=dev)
+        ten_tv = torch_tensor_to_tv(ten)
+        # keep the torch tensor referenced until the pointer entry is freed.
+        self.allocated[ten.data_ptr()] = ten
+        if name:
+            self.allocated[name] = ten
+        if needs_uint_view:
+            return ten_tv.type_view(requested_dtype)
+        return ten_tv
+
+    def _full(self, name: str, shape: List[int], value, dtype: int, device: int) -> tv.Tensor:
+        """Shared body of full_int / full_float.
+
+        Raises:
+            NotImplementedError: value is negative and dtype is one of the
+                unsigned dtypes in _TORCH_UINT_WORKAROUNDS.
+        """
+        if dtype in _TORCH_UINT_WORKAROUNDS and value < 0:
+            raise NotImplementedError("you can't use full for unsigned dtypes")
+        return self._allocate(
+            name, dtype, device, lambda **kw: torch.full(shape, value, **kw))
+
+    def zeros(self, name: str, shape: List[int], dtype: int, device: int) -> tv.Tensor:
+        """Return a zero-filled tensor of the given shape and dtype.
+
+        Provide a name if you want to access it after c++ function exit.
+        """
+        return self._allocate(
+            name, dtype, device, lambda **kw: torch.zeros(shape, **kw))
+
+    def empty(self, name: str, shape: List[int], dtype: int, device: int) -> tv.Tensor:
+        """Return an uninitialized tensor; arguments are the same as zeros."""
+        return self._allocate(
+            name, dtype, device, lambda **kw: torch.empty(shape, **kw))
+
+    def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int) -> tv.Tensor:
+        """Return a tensor filled with the integer value.
+
+        Raises:
+            NotImplementedError: see _full.
+        """
+        return self._full(name, shape, value, dtype, device)
+
+    def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int) -> tv.Tensor:
+        """Return a tensor filled with the float value.
+
+        Raises:
+            NotImplementedError: see _full.
+        """
+        return self._full(name, shape, value, dtype, device)
+
+    def free(self, ten: tv.Tensor):
+        """Remove the pointer entry of ten from the cache.
+
+        Raises:
+            ValueError: ten does not span its whole storage, or its pointer
+                is not in the cache.
+        """
+        if ten.storage_bytesize() != ten.bytesize():
+            raise ValueError("you can't free a sliced tensor.")
+        # pop() hands back the cached tensor, or None for an unknown pointer.
+        if self.allocated.pop(ten.byte_pointer(), None) is None:
+            raise ValueError("can't find your tensor in cache.")
+
+    def free_noexcept(self, ten: tv.Tensor):
+        """Same as free, but returns silently instead of raising.
+
+        For c++ scope guard, free will be called in c++ destructor.
+        """
+        # sliced or unknown tensors are ignored on purpose.
+        if ten.storage_bytesize() == ten.bytesize():
+            self.allocated.pop(ten.byte_pointer(), None)
+
+
 if __name__ == "__main__":
    a = torch.rand(2, 2)
    atv = torch_tensor_to_tv(a)

--- a/spconv/pytorch/hash.py
+++ b/spconv/pytorch/hash.py
@@ -30,6 +30,7 @@ class HashTable:
    """simple hash table for 32 and 64 bit data. support both cpu and cuda.
    for cuda, it's a fixed-size table, you must provide maximum size 
    (recommend 2 * num).
+    key must be int32/int64.
    see spconv/pytorch/functional/sparse_add_hash_based, a real example
    that show how to use hash table to implement 
    sparse add (same shape, different indices)
@@ -91,7 +92,7 @@ class HashTable:
        is_empty = torch.empty([keys.shape[0]], dtype=torch.uint8, device=keys.device)
        is_empty_tv = torch_tensor_to_tv(is_empty)
        self._table.query(keys_tv, values_tv, is_empty_tv, stream)
-        return values, is_empty
+        return values, is_empty > 0

    def insert_exist_keys(self, keys: torch.Tensor, values: torch.Tensor):
        """insert kv that k exists in table. return a uint8 tensor that
@@ -105,7 +106,7 @@ class HashTable:
        is_success = torch.empty([keys.shape[0]], dtype=torch.uint8, device=keys.device)
        is_success_tv = torch_tensor_to_tv(is_success)
        self._table.insert_exist_keys(keys_tv, values_tv, is_success_tv, stream)
-        return is_success
+        return is_success > 0

    def assign_arange_(self):
        """iterate table, assign values with "arange" value.

--- a/spconv/pytorch/ops.py
+++ b/spconv/pytorch/ops.py
@@ -21,10 +21,12 @@ import torch
 import numpy as np
 import spconv
 from spconv.core import AlgoHint, ConvAlgo
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union
 from spconv.pytorch.core import ThrustSortAllocator
 from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream, get_arch
 from spconv.core_cc.csrc.sparse.all import SpconvOps
+from spconv.core_cc.csrc.sparse.alloc import ExternalAllocator
+
 import spconv.core_cc as _ext

 from spconv.utils import nullcontext
@@ -42,6 +44,8 @@ from cumm.gemm import codeops
 from spconv.tools import CUDAKernelTimer

 DEBUG = False
+DEBUG_INT64_HASH_K = True
+INT32_MAX = SpconvOps.get_int32_max()


 def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
@@ -69,6 +73,25 @@ def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
        output_size.append(size)
    return output_size

+class _HashData:
+    """Hash-table scratch buffers: key and value storage with 2 * num slots each.
+
+    use_i64=True allocates int64 keys and int32 values as separate tensors;
+    otherwise one (2, 2 * num) int32 tensor is split into a key row and a
+    value row. Call sites in this diff read hashdata_k_tv / hashdata_v_tv.
+    """
+    def __init__(self, num: int, use_i64: bool, device: torch.device) -> None:
+        slots = num * 2
+        if not use_i64:
+            # the torch tensor stays on self next to the tv views made from it
+            self.hashdata = torch.empty((2, slots), dtype=torch.int32, device=device)
+            packed_tv = torch_tensor_to_tv(self.hashdata)
+            self.hashdata_k_tv, self.hashdata_v_tv = packed_tv[0], packed_tv[1]
+            return
+        self.hashdata_k = torch.empty((slots, ), dtype=torch.int64, device=device)
+        self.hashdata_v = torch.empty((slots, ), dtype=torch.int32, device=device)
+        self.hashdata_k_tv = torch_tensor_to_tv(self.hashdata_k)
+        self.hashdata_v_tv = torch_tensor_to_tv(self.hashdata_v)

 def get_indice_pairs(indices: torch.Tensor,
                     batch_size: int,
@@ -105,7 +128,9 @@ def get_indice_pairs(indices: torch.Tensor,
        )
    assert algo == ConvAlgo.Native, "TODO"
    # indices = indices.cpu()
-
+    spatial_volume = functools.reduce(lambda x, y: x * y, spatial_shape, 1)
+    use_int64_hash_k = spatial_volume >= INT32_MAX or DEBUG_INT64_HASH_K
+    indice_dtype = torch.int64 if use_int64_hash_k else indices.dtype
    pair = torch.full((2, kv, indices.shape[0]),
                      -1,
                      dtype=indices.dtype,
@@ -121,14 +146,16 @@ def get_indice_pairs(indices: torch.Tensor,
        out_inds = indices
        if indices.is_cuda:
            stream = get_current_stream()
-            hashdata = torch.empty((out_inds.shape[0] * 2, ),
-                                   dtype=torch.int64,
-                                   device=indices.device)
+            hashdata = _HashData(out_inds.shape[0], use_int64_hash_k, indices.device)
+            # hashdata = torch.empty((out_inds.shape[0] * 2, ),
+            #                        dtype=torch.int64,
+            #                        device=indices.device)
            out_inds_tv = torch_tensor_to_tv(out_inds)
-            hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
+            # hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)

            SpconvOps.generate_subm_conv_inds(inds_tv,
-                                              hashdata_tv,
+                                              hashdata.hashdata_k_tv,
+                                              hashdata.hashdata_v_tv,
                                              pair_tv,
                                              out_inds_tv,
                                              indice_num_per_loc_tv,
@@ -154,7 +181,7 @@ def get_indice_pairs(indices: torch.Tensor,
        if indices.is_cuda:
            stream = get_current_stream()
            indice_pairs_uniq = torch.empty((pair.numel() // 2 + 1, ),
-                                            dtype=indices.dtype,
+                                            dtype=indice_dtype,
                                            device=indices.device)
            indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)

@@ -183,15 +210,19 @@ def get_indice_pairs(indices: torch.Tensor,
            out_inds = torch.empty((num_act_out, indices.shape[1]),
                                   dtype=indices.dtype,
                                   device=indices.device)
-            hashdata = torch.empty((out_inds.shape[0] * 2, ),
-                                   dtype=torch.int64,
-                                   device=indices.device)
+            # hashdata = torch.empty((out_inds.shape[0] * 2, ),
+            #                        dtype=torch.int64,
+            #                        device=indices.device)
+            hashdata = _HashData(out_inds.shape[0], use_int64_hash_k, indices.device)
+
            out_inds_tv = torch_tensor_to_tv(out_inds)
-            hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
+            # hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
            SpconvOps.generate_conv_inds_stage2(inds_tv,
-                                                hashdata_tv,
+                                                hashdata.hashdata_k_tv,
+                                                hashdata.hashdata_v_tv,
                                                pair_tv,
                                                uniq_res_tv,
+                                                indice_pairs_uniq_tv,
                                                out_inds_tv,
                                                num_out_act=num_act_out,
                                                batch_size=batch_size,
@@ -267,6 +298,10 @@ def get_indice_pairs_implicit_gemm(
    kv: int = functools.reduce(lambda x, y: x * y, ksize, 1)
    # TODO in future we will support up to 128 kernel volume.
    assert kv <= 32, "currently only support kernel volume <= 32 to use implicit gemm"
+    spatial_volume = functools.reduce(lambda x, y: x * y, spatial_shape, 1)
+    use_int64_hash_k = spatial_volume >= INT32_MAX or DEBUG_INT64_HASH_K
+    indice_dtype = torch.int64 if use_int64_hash_k else indices.dtype
+
    if not subm:
        if transpose:
            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
@@ -316,19 +351,22 @@ def get_indice_pairs_implicit_gemm(

    if subm:
        out_inds = indices
-        hashdata = torch.empty((out_inds.shape[0] * 2, ),
-                               dtype=torch.int64,
-                               device=indices.device)
+        # hashdata = torch.empty((out_inds.shape[0] * 2, ),
+        #                        dtype=torch.int64,
+        #                        device=indices.device)
+        hashdata = _HashData(out_inds.shape[0], use_int64_hash_k, indices.device)
+
        pair_mask = torch.empty((mask_split_count, indices.shape[0]),
                                dtype=torch.int32,
                                device=indices.device)

        out_inds_tv = torch_tensor_to_tv(out_inds)
-        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
+        # hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
        pair_mask_tv = torch_tensor_to_tv(pair_mask, dtype=tv.uint32)
        with timer.record("gen_subm_inds", stream):
            SpconvOps.generate_subm_conv_inds(inds_tv,
-                                              hashdata_tv,
+                                              hashdata.hashdata_k_tv,
+                                              hashdata.hashdata_v_tv,
                                              pair_tv,
                                              out_inds_tv,
                                              indice_num_per_loc_tv,
@@ -380,7 +418,7 @@ def get_indice_pairs_implicit_gemm(
        pair_bwd = pair
        pair_bwd_tv = pair_tv
        indice_pairs_uniq = torch.empty((pair.numel() + 1, ),
-                                        dtype=indices.dtype,
+                                        dtype=indice_dtype,
                                        device=indices.device)
        indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
        with timer.record("gen_conv_inds_stage1", stream):
@@ -433,12 +471,13 @@ def get_indice_pairs_implicit_gemm(
                                        device=indices.device)
            pair_mask_bwd_tv = torch_tensor_to_tv(pair_mask_bwd,
                                                  dtype=tv.uint32)
+        hashdata = _HashData(out_inds.shape[0], use_int64_hash_k, indices.device)

-        hashdata = torch.empty((out_inds.shape[0] * 2, ),
-                               dtype=torch.int64,
-                               device=indices.device)
+        # hashdata = torch.empty((out_inds.shape[0] * 2, ),
+        #                        dtype=torch.int64,
+        #                        device=indices.device)
        out_inds_tv = torch_tensor_to_tv(out_inds)
-        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
+        # hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
        if DEBUG:

            CONV.stream_synchronize(stream)
@@ -446,10 +485,12 @@ def get_indice_pairs_implicit_gemm(
            t = time.time()
        with timer.record("gen_conv_inds_stage2", stream):
            SpconvOps.generate_conv_inds_mask_stage2(inds_tv,
-                                                     hashdata_tv,
+                                                     hashdata.hashdata_k_tv,
+                                                     hashdata.hashdata_v_tv,
                                                     pair_fwd_tv,
                                                     pair_bwd_tv,
                                                     uniq_res_tv,
+                                                     indice_pairs_uniq_tv,
                                                     out_inds_tv,
                                                     pair_mask_fwd_tv,
                                                     pair_mask_bwd_tv,
@@ -1138,6 +1179,7 @@ def implicit_gemm(features: torch.Tensor,
    # CONV.stream_synchronize(stream)

    # t = time.time()
+    print(tune_res.algo_desp)
    with timer.record("implicit_gemm", stream):
        for j in range(num_split):
            beta = 0 if j == 0 else 1

--- a/test/dev.py
+++ b/test/dev.py
+import spconv 
+
+
--- a/test/dev2.py
+++ b/test/dev2.py
+# Dev script: builds a TorchAllocator for cuda:0 and passes it to
+# SpconvOps.test_allocator when run as __main__.
+from spconv.pytorch.cppcore import TorchAllocator
+# NOTE(review): print(1) / print(2) look like progress markers placed around
+# the spconv imports -- confirm before removing or reordering them.
+print(1)
+
+from spconv.core_cc.csrc.sparse.all import SpconvOps
+import torch 
+print(2)
+if __name__ == "__main__":
+    # the allocator's gpu device is cuda:0; its cpu device is set internally
+    alloc = TorchAllocator(torch.device("cuda:0"))
+
+    SpconvOps.test_allocator(alloc)
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -352,32 +352,36 @@ def scatter_nd(indices, updates, shape):

 class TestSpConv(TestCase):
    def testSpConv3d(self):
-        np.random.seed(484)
-        torch.manual_seed(48848)
+        np.random.seed(71)
+        torch.manual_seed(705)
        devices = ["cuda:0"]
-        shapes = [[19, 18, 17]]
+        shapes = [[4, 4, 4]]
        batchsizes = [1, 2]

-        in_channels = [32]
+        in_channels = [4]
        out_channels = [32, 48, 64]
        ksizes = [2, 3]
        strides = [1, 2, 3]
        paddings = [0, 1, 2]
        dilations = [1, 2, 3]
+        ksizes = [3]
+        strides = [1]
+        paddings = [0]
+        dilations = [1]
+
        algos = [
-            ConvAlgo.Native, ConvAlgo.MaskImplicitGemm,
-            ConvAlgo.MaskSplitImplicitGemm
+            ConvAlgo.MaskImplicitGemm,
+            # ConvAlgo.MaskSplitImplicitGemm
        ]
-        algos = [ConvAlgo.MaskSplitImplicitGemm]
+        # algos = [ConvAlgo.MaskSplitImplicitGemm]

        for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
                devices, shapes, batchsizes, in_channels, out_channels, ksizes,
                strides, paddings, dilations, algos):
            if all([s > 1, d > 1]):
                continue  # don't support this.
-            print(k, s, p, d)
            device = torch.device(dev)
-            num_points = [1000] * bs
+            num_points = [10] * bs
            dtype = torch.float32
            net = SparseConv3dTestTorch(1,
                                        3,
@@ -398,6 +402,9 @@ class TestSpConv(TestCase):
                np.float32)
            indices = np.ascontiguousarray(
                sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
+            # print(k, s, p, d, features.mean(), indices.mean())
+            # if k == 2 and s == 2 and p == 0 and d == 1:
+            #     breakpoint()
            features_dense = sparse_dict["features_dense"].astype(np.float32)
            indices_t = torch.from_numpy(indices).int().to(device)
            features_t = torch.from_numpy(features).to(device).to(dtype)
@@ -829,4 +836,4 @@ if __name__ == '__main__':
    # main(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
    # TestCase().assertAllClose(out_my, out_ref)
    # unittest.main()
-    TestSpConv().testSpMaxPool3d()
+    TestSpConv().testSpConv3d()