Commit 7af751dc authored by yan.yan's avatar yan.yan
Browse files

sync

parent 647927ce
......@@ -23,7 +23,7 @@ class BoostGeometryLib(pccm.Class):
def __init__(self):
super().__init__()
assert BOOST_ROOT is not None
self.build_meta.add_includes(BOOST_ROOT)
self.build_meta.add_public_includes(BOOST_ROOT)
self.add_include("boost/geometry.hpp")
class BoxOps(pccm.Class):
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union, Dict
import numpy as np
import torch
......
......@@ -14,9 +14,10 @@
from cumm import tensorview as tv
import torch
from typing import Optional, List
from typing import Dict, Optional, List, Union
from spconv.cppconstants import COMPILED_CUDA_ARCHS
import sys
from spconv.core_cc.csrc.sparse.alloc import ExternalAllocator
_TORCH_DTYPE_TO_TV = {
torch.float32: tv.float32,
......@@ -28,7 +29,10 @@ _TORCH_DTYPE_TO_TV = {
torch.int16: tv.int16,
torch.uint8: tv.uint8,
}
# Inverse of _TORCH_DTYPE_TO_TV: maps a tv dtype back to its torch dtype.
_TV_DTYPE_TO_TORCH = {v: k for k, v in _TORCH_DTYPE_TO_TV.items()}
# Unsigned tv dtypes mapped to the signed tv dtype of the same bit width.
# TorchAllocator allocates storage with the signed dtype and then
# reinterprets the result with type_view(<unsigned dtype>).
# NOTE(review): presumably needed because torch offers no matching unsigned
# dtypes for these widths -- confirm against the supported torch versions.
_TORCH_UINT_WORKAROUNDS = {tv.uint32: tv.int32, tv.uint16: tv.int16, tv.uint64: tv.int64}
# Every integer tv dtype, signed and unsigned.
_ALL_INTS = {tv.int32, tv.int16, tv.int8, tv.int64, tv.uint64, tv.uint8, tv.uint32, tv.uint16}
def torch_tensor_to_tv(ten: torch.Tensor,
dtype: Optional[int] = None,
......@@ -46,7 +50,8 @@ def torch_tensor_to_tv(ten: torch.Tensor,
shape = list(ten.shape)
if dtype is None:
dtype = _TORCH_DTYPE_TO_TV[ten.dtype]
return tv.from_blob(ptr, shape, dtype, tv_device)
stride = ten.stride()
return tv.from_blob_strided(ptr, shape, list(stride), dtype, tv_device)
def torch_tensors_to_tv(*tens: torch.Tensor):
    """Lazily convert each given torch tensor with ``torch_tensor_to_tv``.

    Returns a generator, so conversion happens only as the result is
    iterated (or unpacked) by the caller.
    """
    return (torch_tensor_to_tv(tensor) for tensor in tens)
......@@ -62,7 +67,119 @@ def get_arch():
f"may cause invalid device function. "
f"available: {COMPILED_CUDA_ARCHS}", file=sys.stderr)
return arch
class TorchAllocator(ExternalAllocator):
    """Allocator that serves tensor requests with torch tensors.

    Every tensor handed out is stored in ``self.allocated`` under its
    ``data_ptr()`` so the backing torch tensor stays referenced until
    ``free``/``free_noexcept`` drops that entry. When a non-empty ``name`` is
    given, the tensor is additionally stored under that name; ``free`` and
    ``free_noexcept`` only remove the pointer entry, so named tensors remain
    reachable through ``self.allocated[name]``.

    Unsigned dtypes listed in ``_TORCH_UINT_WORKAROUNDS`` are allocated with
    the signed dtype of the same width and returned through ``type_view``;
    they must be requested with an empty name.
    """

    def __init__(self, gpudevice: torch.device) -> None:
        """Create an allocator whose non-CPU requests go to ``gpudevice``."""
        super().__init__()
        self.gpudevice = gpudevice
        self.cpudevice = torch.device("cpu:0")
        # Keys are data pointers (int) and, for named tensors, the name (str).
        self.allocated: Dict[Union[str, int], torch.Tensor] = {}

    def _torch_dtype_and_device(self, name: str, dtype: int, device: int):
        """Translate a tv dtype / device index into torch equivalents.

        Returns ``(torch_dtype, torch_device)``. ``device == -1`` selects the
        CPU device, any other value selects ``self.gpudevice``.
        """
        if dtype in _TORCH_UINT_WORKAROUNDS:
            assert name == "", "must be temp memory for uint dtypes"
            dtype = _TORCH_UINT_WORKAROUNDS[dtype]
        th_dtype = _TV_DTYPE_TO_TORCH[dtype]
        dev = self.cpudevice if device == -1 else self.gpudevice
        return th_dtype, dev

    def _register(self, name: str, ten: torch.Tensor, dtype: int) -> tv.Tensor:
        """Record ``ten`` in the cache and return its tv view.

        ``dtype`` is the dtype originally requested by the caller; when it is
        one of the unsigned workaround dtypes the view is reinterpreted as
        that dtype before being returned.
        """
        ten_tv = torch_tensor_to_tv(ten)
        self.allocated[ten.data_ptr()] = ten
        if name:
            self.allocated[name] = ten
        if dtype in _TORCH_UINT_WORKAROUNDS:
            return ten_tv.type_view(dtype)
        return ten_tv

    def _full(self, name: str, shape: List[int], value: Union[int, float],
              dtype: int, device: int) -> tv.Tensor:
        """Shared implementation of ``full_int`` and ``full_float``."""
        if dtype in _TORCH_UINT_WORKAROUNDS and value < 0:
            raise NotImplementedError("you can't use full for unsigned dtypes")
        th_dtype, dev = self._torch_dtype_and_device(name, dtype, device)
        ten = torch.full(shape, value, dtype=th_dtype, device=dev)
        return self._register(name, ten, dtype)

    def zeros(self, name: str, shape: List[int], dtype: int, device: int) -> tv.Tensor:
        """Allocate a zero-filled tensor of ``shape``.

        Raises:
            AssertionError: an unsigned workaround dtype was requested with a
                non-empty ``name``.
            KeyError: ``dtype`` has no torch equivalent.
        """
        # provide a name if you want to access it after c++ function exit.
        th_dtype, dev = self._torch_dtype_and_device(name, dtype, device)
        ten = torch.zeros(shape, dtype=th_dtype, device=dev)
        return self._register(name, ten, dtype)

    def empty(self, name: str, shape: List[int], dtype: int, device: int) -> tv.Tensor:
        """Allocate an uninitialized tensor of ``shape``.

        Raises the same errors as ``zeros``.
        """
        th_dtype, dev = self._torch_dtype_and_device(name, dtype, device)
        ten = torch.empty(shape, dtype=th_dtype, device=dev)
        return self._register(name, ten, dtype)

    def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int) -> tv.Tensor:
        """Allocate a tensor of ``shape`` filled with the integer ``value``.

        Raises:
            NotImplementedError: a negative ``value`` was requested for an
                unsigned workaround dtype.
            AssertionError: an unsigned workaround dtype was requested with a
                non-empty ``name``.
            KeyError: ``dtype`` has no torch equivalent.
        """
        return self._full(name, shape, value, dtype, device)

    def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int) -> tv.Tensor:
        """Allocate a tensor of ``shape`` filled with the float ``value``.

        Raises the same errors as ``full_int``.
        """
        return self._full(name, shape, value, dtype, device)

    def free(self, ten: tv.Tensor):
        """Drop the cache entry that backs ``ten``.

        Raises:
            ValueError: ``ten`` is a slice (its byte size differs from its
                storage byte size) or its pointer is not in the cache.
        """
        if ten.storage_bytesize() != ten.bytesize():
            raise ValueError("you can't free a sliced tensor.")
        ptr = ten.byte_pointer()
        if ptr not in self.allocated:
            raise ValueError("can't find your tensor in cache.")
        del self.allocated[ptr]

    def free_noexcept(self, ten: tv.Tensor):
        """Like ``free`` but silently ignores sliced or unknown tensors."""
        # for c++ scope guard, free will be called in c++ destructor
        if ten.storage_bytesize() != ten.bytesize():
            return
        self.allocated.pop(ten.byte_pointer(), None)
if __name__ == "__main__":
a = torch.rand(2, 2)
atv = torch_tensor_to_tv(a)
......
......@@ -30,6 +30,7 @@ class HashTable:
"""simple hash table for 32 and 64 bit data. support both cpu and cuda.
for cuda, it's a fixed-size table, you must provide maximum size
(recommend 2 * num).
key must be int32/int64.
see spconv/pytorch/functional/sparse_add_hash_based, a real example
that show how to use hash table to implement
sparse add (same shape, different indices)
......@@ -91,7 +92,7 @@ class HashTable:
is_empty = torch.empty([keys.shape[0]], dtype=torch.uint8, device=keys.device)
is_empty_tv = torch_tensor_to_tv(is_empty)
self._table.query(keys_tv, values_tv, is_empty_tv, stream)
return values, is_empty
return values, is_empty > 0
def insert_exist_keys(self, keys: torch.Tensor, values: torch.Tensor):
"""insert kv that k exists in table. return a uint8 tensor that
......@@ -105,7 +106,7 @@ class HashTable:
is_success = torch.empty([keys.shape[0]], dtype=torch.uint8, device=keys.device)
is_success_tv = torch_tensor_to_tv(is_success)
self._table.insert_exist_keys(keys_tv, values_tv, is_success_tv, stream)
return is_success
return is_success > 0
def assign_arange_(self):
"""iterate table, assign values with "arange" value.
......
......@@ -21,10 +21,12 @@ import torch
import numpy as np
import spconv
from spconv.core import AlgoHint, ConvAlgo
from typing import List, Optional, Union
from typing import Dict, List, Optional, Union
from spconv.pytorch.core import ThrustSortAllocator
from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream, get_arch
from spconv.core_cc.csrc.sparse.all import SpconvOps
from spconv.core_cc.csrc.sparse.alloc import ExternalAllocator
import spconv.core_cc as _ext
from spconv.utils import nullcontext
......@@ -42,6 +44,8 @@ from cumm.gemm import codeops
from spconv.tools import CUDAKernelTimer
# Gates extra debugging code in this module (see the `if DEBUG:` branches).
DEBUG = False
# When True, use_int64_hash_k is forced on in get_indice_pairs and
# get_indice_pairs_implicit_gemm regardless of the spatial volume, so the
# int64 hash-key path is always taken.
# NOTE(review): looks like a debugging override left enabled -- confirm
# whether this should be False outside of testing.
DEBUG_INT64_HASH_K = True
# Spatial volumes >= this value select int64 hash keys; the value comes from
# SpconvOps.get_int32_max().
INT32_MAX = SpconvOps.get_int32_max()
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
......@@ -69,6 +73,25 @@ def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
output_size.append(size)
return output_size
class _HashData:
    """Key/value scratch buffers for the index-generation hash table.

    Both layouts provide ``2 * num`` slots and expose tv views as
    ``hashdata_k_tv`` and ``hashdata_v_tv``:

    * ``use_i64=True``: separate int64 key and int32 value tensors, kept as
      ``hashdata_k`` / ``hashdata_v``.
    * ``use_i64=False``: one int32 tensor of shape ``(2, 2 * num)`` kept as
      ``hashdata``; row 0 is the key view and row 1 the value view.
    """

    def __init__(self, num: int, use_i64: bool, device: torch.device) -> None:
        slot_count = num * 2
        if not use_i64:
            # Keys and values share one int32 allocation.
            self.hashdata = torch.empty((2, slot_count),
                                        dtype=torch.int32,
                                        device=device)
            packed_tv = torch_tensor_to_tv(self.hashdata)
            self.hashdata_k_tv = packed_tv[0]
            self.hashdata_v_tv = packed_tv[1]
            return
        # 64-bit keys need their own tensor; values stay int32.
        self.hashdata_k = torch.empty((slot_count, ),
                                      dtype=torch.int64,
                                      device=device)
        self.hashdata_v = torch.empty((slot_count, ),
                                      dtype=torch.int32,
                                      device=device)
        self.hashdata_k_tv = torch_tensor_to_tv(self.hashdata_k)
        self.hashdata_v_tv = torch_tensor_to_tv(self.hashdata_v)
def get_indice_pairs(indices: torch.Tensor,
batch_size: int,
......@@ -105,7 +128,9 @@ def get_indice_pairs(indices: torch.Tensor,
)
assert algo == ConvAlgo.Native, "TODO"
# indices = indices.cpu()
spatial_volume = functools.reduce(lambda x, y: x * y, spatial_shape, 1)
use_int64_hash_k = spatial_volume >= INT32_MAX or DEBUG_INT64_HASH_K
indice_dtype = torch.int64 if use_int64_hash_k else indices.dtype
pair = torch.full((2, kv, indices.shape[0]),
-1,
dtype=indices.dtype,
......@@ -121,14 +146,16 @@ def get_indice_pairs(indices: torch.Tensor,
out_inds = indices
if indices.is_cuda:
stream = get_current_stream()
hashdata = torch.empty((out_inds.shape[0] * 2, ),
dtype=torch.int64,
device=indices.device)
hashdata = _HashData(out_inds.shape[0], use_int64_hash_k, indices.device)
# hashdata = torch.empty((out_inds.shape[0] * 2, ),
# dtype=torch.int64,
# device=indices.device)
out_inds_tv = torch_tensor_to_tv(out_inds)
hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
# hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
SpconvOps.generate_subm_conv_inds(inds_tv,
hashdata_tv,
hashdata.hashdata_k_tv,
hashdata.hashdata_v_tv,
pair_tv,
out_inds_tv,
indice_num_per_loc_tv,
......@@ -154,7 +181,7 @@ def get_indice_pairs(indices: torch.Tensor,
if indices.is_cuda:
stream = get_current_stream()
indice_pairs_uniq = torch.empty((pair.numel() // 2 + 1, ),
dtype=indices.dtype,
dtype=indice_dtype,
device=indices.device)
indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
......@@ -183,15 +210,19 @@ def get_indice_pairs(indices: torch.Tensor,
out_inds = torch.empty((num_act_out, indices.shape[1]),
dtype=indices.dtype,
device=indices.device)
hashdata = torch.empty((out_inds.shape[0] * 2, ),
dtype=torch.int64,
device=indices.device)
# hashdata = torch.empty((out_inds.shape[0] * 2, ),
# dtype=torch.int64,
# device=indices.device)
hashdata = _HashData(out_inds.shape[0], use_int64_hash_k, indices.device)
out_inds_tv = torch_tensor_to_tv(out_inds)
hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
# hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
SpconvOps.generate_conv_inds_stage2(inds_tv,
hashdata_tv,
hashdata.hashdata_k_tv,
hashdata.hashdata_v_tv,
pair_tv,
uniq_res_tv,
indice_pairs_uniq_tv,
out_inds_tv,
num_out_act=num_act_out,
batch_size=batch_size,
......@@ -267,6 +298,10 @@ def get_indice_pairs_implicit_gemm(
kv: int = functools.reduce(lambda x, y: x * y, ksize, 1)
# TODO in future we will support up to 128 kernel volume.
assert kv <= 32, "currently only support kernel volume <= 32 to use implicit gemm"
spatial_volume = functools.reduce(lambda x, y: x * y, spatial_shape, 1)
use_int64_hash_k = spatial_volume >= INT32_MAX or DEBUG_INT64_HASH_K
indice_dtype = torch.int64 if use_int64_hash_k else indices.dtype
if not subm:
if transpose:
out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
......@@ -316,19 +351,22 @@ def get_indice_pairs_implicit_gemm(
if subm:
out_inds = indices
hashdata = torch.empty((out_inds.shape[0] * 2, ),
dtype=torch.int64,
device=indices.device)
# hashdata = torch.empty((out_inds.shape[0] * 2, ),
# dtype=torch.int64,
# device=indices.device)
hashdata = _HashData(out_inds.shape[0], use_int64_hash_k, indices.device)
pair_mask = torch.empty((mask_split_count, indices.shape[0]),
dtype=torch.int32,
device=indices.device)
out_inds_tv = torch_tensor_to_tv(out_inds)
hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
# hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
pair_mask_tv = torch_tensor_to_tv(pair_mask, dtype=tv.uint32)
with timer.record("gen_subm_inds", stream):
SpconvOps.generate_subm_conv_inds(inds_tv,
hashdata_tv,
hashdata.hashdata_k_tv,
hashdata.hashdata_v_tv,
pair_tv,
out_inds_tv,
indice_num_per_loc_tv,
......@@ -380,7 +418,7 @@ def get_indice_pairs_implicit_gemm(
pair_bwd = pair
pair_bwd_tv = pair_tv
indice_pairs_uniq = torch.empty((pair.numel() + 1, ),
dtype=indices.dtype,
dtype=indice_dtype,
device=indices.device)
indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
with timer.record("gen_conv_inds_stage1", stream):
......@@ -433,12 +471,13 @@ def get_indice_pairs_implicit_gemm(
device=indices.device)
pair_mask_bwd_tv = torch_tensor_to_tv(pair_mask_bwd,
dtype=tv.uint32)
hashdata = _HashData(out_inds.shape[0], use_int64_hash_k, indices.device)
hashdata = torch.empty((out_inds.shape[0] * 2, ),
dtype=torch.int64,
device=indices.device)
# hashdata = torch.empty((out_inds.shape[0] * 2, ),
# dtype=torch.int64,
# device=indices.device)
out_inds_tv = torch_tensor_to_tv(out_inds)
hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
# hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
if DEBUG:
CONV.stream_synchronize(stream)
......@@ -446,10 +485,12 @@ def get_indice_pairs_implicit_gemm(
t = time.time()
with timer.record("gen_conv_inds_stage2", stream):
SpconvOps.generate_conv_inds_mask_stage2(inds_tv,
hashdata_tv,
hashdata.hashdata_k_tv,
hashdata.hashdata_v_tv,
pair_fwd_tv,
pair_bwd_tv,
uniq_res_tv,
indice_pairs_uniq_tv,
out_inds_tv,
pair_mask_fwd_tv,
pair_mask_bwd_tv,
......@@ -1138,6 +1179,7 @@ def implicit_gemm(features: torch.Tensor,
# CONV.stream_synchronize(stream)
# t = time.time()
print(tune_res.algo_desp)
with timer.record("implicit_gemm", stream):
for j in range(num_split):
beta = 0 if j == 0 else 1
......
# Smoke-test script: hands a TorchAllocator to SpconvOps.test_allocator.
import spconv
from spconv.pytorch.cppcore import TorchAllocator

# Progress marker printed once the cppcore import has completed.
print(1)
from spconv.core_cc.csrc.sparse.all import SpconvOps
import torch

# Progress marker printed once SpconvOps and torch are imported.
print(2)

if __name__ == "__main__":
    cuda_device = torch.device("cuda:0")
    allocator = TorchAllocator(cuda_device)
    SpconvOps.test_allocator(allocator)
......@@ -352,32 +352,36 @@ def scatter_nd(indices, updates, shape):
class TestSpConv(TestCase):
def testSpConv3d(self):
np.random.seed(484)
torch.manual_seed(48848)
np.random.seed(71)
torch.manual_seed(705)
devices = ["cuda:0"]
shapes = [[19, 18, 17]]
shapes = [[4, 4, 4]]
batchsizes = [1, 2]
in_channels = [32]
in_channels = [4]
out_channels = [32, 48, 64]
ksizes = [2, 3]
strides = [1, 2, 3]
paddings = [0, 1, 2]
dilations = [1, 2, 3]
ksizes = [3]
strides = [1]
paddings = [0]
dilations = [1]
algos = [
ConvAlgo.Native, ConvAlgo.MaskImplicitGemm,
ConvAlgo.MaskSplitImplicitGemm
ConvAlgo.MaskImplicitGemm,
# ConvAlgo.MaskSplitImplicitGemm
]
algos = [ConvAlgo.MaskSplitImplicitGemm]
# algos = [ConvAlgo.MaskSplitImplicitGemm]
for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations, algos):
if all([s > 1, d > 1]):
continue # don't support this.
print(k, s, p, d)
device = torch.device(dev)
num_points = [1000] * bs
num_points = [10] * bs
dtype = torch.float32
net = SparseConv3dTestTorch(1,
3,
......@@ -398,6 +402,9 @@ class TestSpConv(TestCase):
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
# print(k, s, p, d, features.mean(), indices.mean())
# if k == 2 and s == 2 and p == 0 and d == 1:
# breakpoint()
features_dense = sparse_dict["features_dense"].astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device).to(dtype)
......@@ -829,4 +836,4 @@ if __name__ == '__main__':
# main(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32)
# TestCase().assertAllClose(out_my, out_ref)
# unittest.main()
TestSpConv().testSpMaxPool3d()
TestSpConv().testSpConv3d()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment