Commit 076bdb05 authored by yan.yan's avatar yan.yan
Browse files

v2.1.14: add hash table, fix small bug

parent d406d9e2
...@@ -30,14 +30,14 @@ jobs: ...@@ -30,14 +30,14 @@ jobs:
- 'spconv/algo.py' - 'spconv/algo.py'
- 'spconv/core.py' - 'spconv/core.py'
- 'pyproject.toml' - 'pyproject.toml'
- name: Install CUDA - name: Install Boost
env: env:
CUDA_VERSION: ${{ matrix.cuda-version }} CUDA_VERSION: ${{ matrix.cuda-version }}
PYTHON_VERSION: ${{ matrix.python-version }} PYTHON_VERSION: ${{ matrix.python-version }}
cuda: ${{ matrix.cuda-version }} cuda: ${{ matrix.cuda-version }}
BOOST_VERSION: boost_1_77_0 BOOST_VERSION: boost_1_77_0
if: | if: |
(env.CUDA_VERSION != '') && ( (
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) ||
( (
(steps.changes.outputs.needbuild == 'true') && (steps.changes.outputs.needbuild == 'true') &&
...@@ -49,7 +49,24 @@ jobs: ...@@ -49,7 +49,24 @@ jobs:
$ProgressPreference = 'SilentlyContinue' $ProgressPreference = 'SilentlyContinue'
Invoke-WebRequest -Uri "https://boostorg.jfrog.io/artifactory/main/release/1.77.0/source/boost_1_77_0.zip" -UseBasicParsing -OutFile $HOME/boost.zip Invoke-WebRequest -Uri "https://boostorg.jfrog.io/artifactory/main/release/1.77.0/source/boost_1_77_0.zip" -UseBasicParsing -OutFile $HOME/boost.zip
Expand-Archive $HOME/boost.zip -DestinationPath $HOME/boost Expand-Archive $HOME/boost.zip -DestinationPath $HOME/boost
- name: Install CUDA
env:
CUDA_VERSION: ${{ matrix.cuda-version }}
PYTHON_VERSION: ${{ matrix.python-version }}
cuda: ${{ matrix.cuda-version }}
if: |
(env.CUDA_VERSION != '') && (
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) ||
(
(steps.changes.outputs.needbuild == 'true') &&
(env.PYTHON_VERSION == '3.10')
)
)
shell: powershell
run: |
.\tools\install_windows_cuda.ps1 .\tools\install_windows_cuda.ps1
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2 uses: actions/setup-python@v2
with: with:
...@@ -66,6 +83,7 @@ jobs: ...@@ -66,6 +83,7 @@ jobs:
CUDA_VERSION: ${{ matrix.cuda-version }} CUDA_VERSION: ${{ matrix.cuda-version }}
PYTHON_VERSION: ${{ matrix.python-version }} PYTHON_VERSION: ${{ matrix.python-version }}
BOOST_VERSION: boost_1_77_0 BOOST_VERSION: boost_1_77_0
CUMM_CUDA_VERSION: ${{ matrix.cuda-version }}
if: | if: |
(env.CUDA_VERSION != '') && ( (env.CUDA_VERSION != '') && (
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) ||
...@@ -75,7 +93,6 @@ jobs: ...@@ -75,7 +93,6 @@ jobs:
) )
) )
run: | run: |
$Env:CUMM_CUDA_VERSION = "${{ matrix.cuda-version }}"
$Env:CUMM_CUDA_ARCH_LIST = "all" $Env:CUMM_CUDA_ARCH_LIST = "all"
$Env:SPCONV_DISABLE_JIT = "1" $Env:SPCONV_DISABLE_JIT = "1"
pip install pccm pybind11 pip install pccm pybind11
......
# Changelog # Changelog
## [2.1.14] - 2021-11-28
### Added
- Add hash table
- Update cumm version
- Add AddTableMisaligned for sptensors with same shape but different indices.
### Fixed
- Fix a bug that was fixed in 2.1.10 but reintroduced in 2.1.12.
## [2.1.13] - 2021-?-? ## [2.1.13] - 2021-?-?
### Added ### Added
- Add some ops from spconv 1.x, see spconv.utils for more details. - Add some ops from spconv 1.x, see spconv.utils for more details.
......
...@@ -171,7 +171,9 @@ You need to rebuild ```cumm``` first if you are build along a CUDA version that ...@@ -171,7 +171,9 @@ You need to rebuild ```cumm``` first if you are build along a CUDA version that
5. run ```pip install pccm cumm wheel``` 5. run ```pip install pccm cumm wheel```
6. run ```python setup.py bdist_wheel```+```pip install dists/xxx.whl``` 6. run ```python setup.py bdist_wheel```+```pip install dists/xxx.whl```
## Known issues
* Spconv 2.x F16 runs slowly on A100.
## Note ## Note
......
...@@ -27,15 +27,17 @@ Network Code: test/benchmark.py ...@@ -27,15 +27,17 @@ Network Code: test/benchmark.py
| F16 Forward | Native| Implicit Gemm | Implicit Gemm Split Mask | | F16 Forward | Native| Implicit Gemm | Implicit Gemm Split Mask |
| -------------- |:---------------------:|---------------------:| ---------------------:| | -------------- |:---------------------:|---------------------:| ---------------------:|
| RTX 3080 Laptop 150W | 13.7ms | 11.2ms | 12.2ms | | RTX 3080 Laptop 150W@1755MHz | 13.7ms | 11.2ms | 12.2ms |
| RTX A6000 | 19.1ms | 11.7ms | 14.0ms | | RTX A6000 | 19.1ms | 11.7ms | 14.0ms |
| TESLA V100 | 17.9ms | 11.4ms | 13.4ms | | TESLA V100 | 17.9ms | 11.4ms | 13.4ms |
| A100 | 23.8ms | 12.4ms | 15.1ms |
| F16 Backward | Native| Implicit Gemm | Implicit Gemm Split Mask | | F16 Backward | Native| Implicit Gemm | Implicit Gemm Split Mask |
| -------------- |:---------------------:|---------------------:| ---------------------:| | -------------- |:---------------------:|---------------------:| ---------------------:|
| RTX 3080 Laptop 150W | 25.2ms | 13.8ms | 12.2ms | | RTX 3080 Laptop 150W@1755MHz | 25.2ms | 13.8ms | 12.2ms |
| RTX A6000 | 28.1ms | 9.2ms | 8.9ms | | RTX A6000 | 28.1ms | 9.2ms | 8.9ms |
| TESLA V100 | 33.9ms | 12.2ms | 12.9ms | | TESLA V100 | 33.9ms | 12.2ms | 12.9ms |
| A100 | 37.6ms | 12.2ms | 13.9ms |
### Network Gemm Kernel Benchmark FP16 in RTX 3080 Laptop GPU ### Network Gemm Kernel Benchmark FP16 in RTX 3080 Laptop GPU
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from spconv.pytorch.hash import HashTable
def main():
    """Demonstrate the fixed-size hash table on CPU and on CUDA.

    The table cannot delete keys after they are inserted and cannot be
    resized, so its length has to be chosen up front; roughly twice the
    number of keys is recommended.
    """
    capacity = 1000
    key_dtype, value_dtype = torch.int32, torch.int64

    def show_items(tbl):
        # items() returns key/value buffers plus a count tensor; only the
        # first `count` entries are printed.
        stored_keys, stored_values, count = tbl.items()
        num_items = count.item()
        print(count, stored_keys[:num_items], stored_values[:num_items])

    for device_name in ("cpu", "cuda:0"):
        dev = torch.device(device_name)
        if dev.type == "cpu":
            # The CPU table is created without a max_size.
            table = HashTable(dev, key_dtype, value_dtype)
        else:
            table = HashTable(dev, key_dtype, value_dtype, max_size=capacity)

        keys = torch.tensor([5, 3, 7, 4, 6, 2, 10, 8], dtype=key_dtype, device=dev)
        values = torch.tensor([1, 6, 4, 77, 23, 756, 12, 12], dtype=value_dtype, device=dev)
        # Same keys as above, in reverse order.
        keys_query = torch.tensor([8, 10, 2, 6, 4, 7, 3, 5], dtype=key_dtype, device=dev)

        table.insert(keys, values)
        queried_values, _ = table.query(keys_query)
        print(queried_values)

        show_items(table)
        # Overwrite the stored values with an arange, then dump again.
        table.assign_arange_()
        show_items(table)
if __name__ == "__main__":
main()
\ No newline at end of file
...@@ -29,6 +29,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable( ...@@ -29,6 +29,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from spconv.csrc.sparse.all import SpconvOps from spconv.csrc.sparse.all import SpconvOps
from spconv.csrc.utils import BoxOps from spconv.csrc.utils import BoxOps
from spconv.csrc.hash.core import HashTable
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS +
SHUFFLE_TURING_PARAMS) SHUFFLE_TURING_PARAMS)
...@@ -40,7 +41,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable( ...@@ -40,7 +41,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
if InWindows: if InWindows:
# windows have command line limit, so we use objects_folder to reduce command size. # windows have command line limit, so we use objects_folder to reduce command size.
objects_folder = "objects" objects_folder = "objects"
pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps()], pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps(), HashTable()],
PACKAGE_ROOT / "core_cc", PACKAGE_ROOT / "core_cc",
namespace_root=PACKAGE_ROOT, namespace_root=PACKAGE_ROOT,
objects_folder=objects_folder, objects_folder=objects_folder,
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class HashTable:
    # Type stub for the compiled hash-table binding; every body is `...`.
    # Size in bytes of one key / one value (the torch wrapper passes 4 or 8).
    key_itemsize: int
    value_itemsize: int
    # True when the table lives on the CPU.
    is_cpu: bool
    # NOTE(review): presumably the number of inserted items -- confirm
    # against the C++ source.
    insert_count: int
    def __init__(self, is_cpu: bool, key_itemsize: int, value_itemsize: int, keys_data: Tensor, values_data: Tensor, stream: int = 0) -> None:
        """
        Create a hash table.

        Args:
            is_cpu: whether the table is a CPU table.
            key_itemsize: size of one key in bytes.
            value_itemsize: size of one value in bytes.
            keys_data: key storage tensor. The torch wrapper passes a
                pre-allocated CUDA buffer, or an empty tensor for a
                CPU table.
            values_data: value storage tensor, passed the same way as
                ``keys_data``.
            stream: stream handle as an integer. The torch wrapper
                passes 0 for a CPU table.
        """
        ...
    def clear(self, stream: int = 0) -> None:
        """
        NOTE(review): the generated text here was a copy of the
        ``insert`` docstring and referred to a ``values`` argument this
        method does not take. Presumably this empties the table --
        confirm against the C++ source.

        Args:
            stream: stream handle as an integer.
        """
        ...
    def insert(self, keys: Tensor, values: Tensor = Tensor(), stream: int = 0) -> None:
        """
        in this function, if values is empty, it will be assigned to zero.

        Args:
            keys: keys to insert.
            values: values to store for ``keys``; defaults to an empty
                tensor.
            stream: stream handle as an integer.
        """
        ...
    def query(self, keys: Tensor, values: Tensor, is_empty: Tensor, stream: int) -> None:
        """
        query keys, save to values, and save is_empty to is_empty

        Args:
            keys: keys to look up.
            values: output tensor that receives the looked-up values.
            is_empty: output tensor that receives the is_empty flags.
            stream: stream handle as an integer.
        """
        ...
    def assign_arange_(self, count: Tensor, stream: int = 0) -> None:
        """
        this function assign "arange(NumItem)" to table values.
        useful in "unique-like" operations.
        unlike insert/query, this method only support i32/i64/u32/u64 for value.
        count must be u32/u64.

        Args:
            count: counter tensor (u32/u64, see above).
            stream: stream handle as an integer.
        """
        ...
    def size_cpu(self) -> int:
        """
        this function can only be used to get cpu hash table size.
        """
        ...
    def items(self, keys: Tensor, values: Tensor, count: Tensor, stream: int) -> None:
        """
        get items.

        Args:
            keys: output tensor that receives the stored keys.
            values: output tensor that receives the stored values.
            count: counter tensor. NOTE(review): presumably receives the
                number of items written -- confirm.
            stream: stream handle as an integer.
        """
        ...
...@@ -14,36 +14,3 @@ class BoxOps: ...@@ -14,36 +14,3 @@ class BoxOps:
eps: eps:
""" """
... ...
@staticmethod
def rotate_non_max_suppression_cpu(box_corners: Tensor, order: Tensor, standup_iou: Tensor, thresh: float, eps: float = 0) -> List[int]:
"""
Args:
box_corners:
order:
standup_iou:
thresh:
eps:
"""
...
@staticmethod
def rbbox_iou(box_corners: Tensor, qbox_corners: Tensor, standup_iou: Tensor, overlaps: Tensor, standup_thresh: float, inter_only: bool) -> None:
"""
Args:
box_corners:
qbox_corners:
standup_iou:
overlaps:
standup_thresh:
inter_only:
"""
...
@staticmethod
def rbbox_iou_aligned(box_corners: Tensor, qbox_corners: Tensor, overlaps: Tensor, inter_only: bool) -> None:
"""
Args:
box_corners:
qbox_corners:
overlaps:
inter_only:
"""
...
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
This diff is collapsed.
...@@ -3,6 +3,7 @@ from pathlib import Path ...@@ -3,6 +3,7 @@ from pathlib import Path
import numpy as np import numpy as np
import torch import torch
from spconv.pytorch.core import SparseConvTensor
from spconv.pytorch import functional, ops from spconv.pytorch import functional, ops
from spconv.pytorch.conv import (SparseConv1d, SparseConv2d, SparseConv3d, from spconv.pytorch.conv import (SparseConv1d, SparseConv2d, SparseConv3d,
SparseConv4d, SparseConvTranspose1d, SparseConv4d, SparseConvTranspose1d,
...@@ -11,7 +12,6 @@ from spconv.pytorch.conv import (SparseConv1d, SparseConv2d, SparseConv3d, ...@@ -11,7 +12,6 @@ from spconv.pytorch.conv import (SparseConv1d, SparseConv2d, SparseConv3d,
SparseInverseConv2d, SparseInverseConv3d, SparseInverseConv2d, SparseInverseConv3d,
SparseInverseConv4d, SubMConv1d, SubMConv2d, SparseInverseConv4d, SubMConv1d, SubMConv2d,
SubMConv3d, SubMConv4d) SubMConv3d, SubMConv4d)
from spconv.pytorch.core import SparseConvTensor
from spconv.pytorch.identity import Identity from spconv.pytorch.identity import Identity
from spconv.pytorch.modules import (SparseModule, SparseSequential, from spconv.pytorch.modules import (SparseModule, SparseSequential,
assign_name_for_sparse_modules) assign_name_for_sparse_modules)
......
...@@ -18,7 +18,6 @@ import numpy as np ...@@ -18,7 +18,6 @@ import numpy as np
import torch import torch
from spconv.core import ConvAlgo from spconv.core import ConvAlgo
from spconv.pytorch.constants import PYTORCH_VERSION from spconv.pytorch.constants import PYTORCH_VERSION
from spconv.pytorch.ops import ThrustSortAllocator
from spconv.tools import CUDAKernelTimer from spconv.tools import CUDAKernelTimer
if PYTORCH_VERSION >= [1, 8, 0]: if PYTORCH_VERSION >= [1, 8, 0]:
...@@ -39,6 +38,24 @@ else: ...@@ -39,6 +38,24 @@ else:
pass pass
class ThrustSortAllocator:
    """Cache of uint8 scratch buffers, handed out as raw data pointers.

    Buffers are kept in ``alloced_objs`` keyed by the size they were
    created with, and are never released by this class.
    """

    def __init__(self, device: torch.device) -> None:
        super().__init__()
        self.alloced_objs = {}
        self.device = device

    def alloc(self, n: int):
        """Return the data pointer of a buffer holding at least ``n`` bytes.

        An exact-size buffer is preferred. Otherwise the first cached
        buffer (in insertion order) that is strictly larger than ``n`` is
        reused. Only when neither exists is a new buffer allocated on
        ``self.device`` and cached.
        """
        exact = self.alloced_objs.get(n)
        if exact is not None:
            return exact.data_ptr()
        for capacity, buf in self.alloced_objs.items():
            if capacity > n:
                return buf.data_ptr()
        fresh = torch.empty([n], dtype=torch.uint8, device=self.device)
        self.alloced_objs[n] = fresh
        return fresh.data_ptr()
class IndiceData(object): class IndiceData(object):
def __init__(self, out_indices, indices, indice_pairs, indice_pair_num, def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
spatial_shape, out_spatial_shape, is_subm: bool, algo: ConvAlgo): spatial_shape, out_spatial_shape, is_subm: bool, algo: ConvAlgo):
......
...@@ -46,6 +46,9 @@ def torch_tensor_to_tv(ten: torch.Tensor, ...@@ -46,6 +46,9 @@ def torch_tensor_to_tv(ten: torch.Tensor,
dtype = _TORCH_DTYPE_TO_TV[ten.dtype] dtype = _TORCH_DTYPE_TO_TV[ten.dtype]
return tv.from_blob(ptr, shape, dtype, tv_device) return tv.from_blob(ptr, shape, dtype, tv_device)
def torch_tensors_to_tv(*tens: torch.Tensor):
    """Lazily convert each given torch tensor with ``torch_tensor_to_tv``.

    Returns a generator, so conversions happen only as the result is
    iterated or unpacked, and the result can be consumed only once.
    """
    return (torch_tensor_to_tv(tensor) for tensor in tens)
def get_current_stream(): def get_current_stream():
return torch.cuda.current_stream().cuda_stream return torch.cuda.current_stream().cuda_stream
......
...@@ -20,15 +20,18 @@ from torch import nn ...@@ -20,15 +20,18 @@ from torch import nn
from torch.autograd import Function from torch.autograd import Function
from typing import Optional, TypeVar from typing import Optional, TypeVar
from spconv.tools import CUDAKernelTimer from spconv.tools import CUDAKernelTimer
from spconv.pytorch import ops from spconv.pytorch import ops, SparseConvTensor
from spconv.pytorch.constants import PYTORCH_VERSION from spconv.pytorch.constants import PYTORCH_VERSION
from spconv.debug_utils import spconv_save_debug_data from spconv.debug_utils import spconv_save_debug_data
from torch.autograd.function import once_differentiable from torch.autograd.function import once_differentiable
import numpy as np import numpy as np
from pathlib import Path from pathlib import Path
from spconv.pytorch.hash import HashTable
from cumm.gemm.layout import to_stride
from typing import List from typing import List
_MAX_INT32 = 2147483647
_T = TypeVar("_T") _T = TypeVar("_T")
def identity_decorator(func: _T) -> _T: def identity_decorator(func: _T) -> _T:
...@@ -357,3 +360,69 @@ indice_inverse_conv = SparseInverseConvFunction.apply ...@@ -357,3 +360,69 @@ indice_inverse_conv = SparseInverseConvFunction.apply
indice_subm_conv = SubMConvFunction.apply indice_subm_conv = SubMConvFunction.apply
indice_maxpool = SparseMaxPoolFunction.apply indice_maxpool = SparseMaxPoolFunction.apply
indice_maxpool_implicit_gemm = SparseMaxPoolImplicitGemmFunction.apply indice_maxpool_implicit_gemm = SparseMaxPoolImplicitGemmFunction.apply
def _indice_to_scalar(indices: torch.Tensor, shape: List[int]):
    """Collapse multi-column indices into one scalar key per row.

    The result is ``indices[:, -1] + sum(stride[i] * indices[:, i])`` over
    every column except the last, where ``stride = to_stride(shape)``.

    Args:
        indices: 2-D integer tensor with one column per entry of ``shape``.
        shape: extent of each index column.

    Returns:
        A new contiguous 1-D tensor with the dtype of ``indices``. The
        input tensor is left untouched.
    """
    assert indices.shape[1] == len(shape)
    stride = to_stride(np.array(shape, dtype=np.int64))
    # Slicing a column yields a view into `indices`; accumulating into it
    # in place would overwrite the caller's last index column. Work on a
    # copy instead.
    scalar_inds = indices[:, -1].clone()
    for i in range(len(shape) - 1):
        # int() turns the numpy scalar into a plain Python number so the
        # product keeps the dtype of `indices`.
        scalar_inds += int(stride[i]) * indices[:, i]
    return scalar_inds.contiguous()
def sparse_add_hash_based(*tens: SparseConvTensor):
    """Add sparse tensors that share a shape but may have different indices.

    Every input must have the same spatial shape, batch size and number of
    feature channels. The indices of each input are flattened to scalar
    keys and inserted into a hash table sized at twice the total number of
    input rows. ``assign_arange_`` then gives each stored key an output
    row, and features are accumulated into those rows.

    Returns:
        A new SparseConvTensor. Benchmark settings, timer and thrust
        allocator are taken from the first input.
    """
    first = tens[0]
    for ten in tens:
        assert ten.spatial_shape == first.spatial_shape
        assert ten.batch_size == first.batch_size
        assert ten.features.shape[1] == first.features.shape[1]
    # Fixed-size table: twice the total number of input rows.
    table_size = 2 * sum(ten.features.shape[0] for ten in tens)

    feat = first.features
    shape = [first.batch_size, *first.spatial_shape]
    whole_shape = int(np.prod(shape))
    # Flattened keys only fit in int32 when the dense volume is below the
    # int32 maximum; otherwise switch keys (and indices) to int64.
    need_int64 = whole_shape >= _MAX_INT32
    k_type = torch.int64 if need_int64 else torch.int32
    table = HashTable(first.features.device, k_type, torch.int32, table_size)

    scalars: List[torch.Tensor] = []
    for ten in tens:
        # NOTE(review): on the int32 path `ten.indices` itself is handed to
        # _indice_to_scalar -- verify that helper does not modify its
        # argument in place.
        indices = ten.indices.long() if need_int64 else ten.indices
        scalar = _indice_to_scalar(indices, shape)
        scalars.append(scalar)
        table.insert(scalar)

    # assign arange to values of hash table
    count = table.assign_arange_()
    num_out = int(count.item())
    out_features = torch.zeros([num_out, feat.shape[1]],
                               dtype=feat.dtype,
                               device=feat.device)
    out_indices = torch.zeros([num_out, first.indices.shape[1]],
                              dtype=first.indices.dtype,
                              device=first.indices.device)

    for ten, scalar in zip(tens, scalars):
        out_rows, _ = table.query(scalar)
        out_rows = out_rows.long()
        out_features[out_rows] += ten.features
        out_indices[out_rows] = ten.indices

    res = SparseConvTensor(out_features,
                           out_indices,
                           first.spatial_shape,
                           first.batch_size,
                           benchmark=first.benchmark)
    res.benchmark_record = first.benchmark_record
    res._timer = first._timer
    res.thrust_allocator = first.thrust_allocator
    return res
def sparse_add(a: SparseConvTensor, b: SparseConvTensor):
    """Add two sparse tensors through torch sparse COO tensors.

    Both inputs are wrapped as ``torch.sparse_coo_tensor`` objects of shape
    ``[batch_size, *spatial_shape, channels]``, added, and coalesced.

    Args:
        a: first operand.
        b: second operand; must match ``a`` in spatial shape, batch size
            and number of feature channels.

    Returns:
        A new SparseConvTensor built from the coalesced values and int
        indices. Only features, indices, spatial shape and batch size are
        passed to its constructor.

    Raises:
        AssertionError: if the operands' shapes, batch sizes or channel
            counts differ.
    """
    assert a.spatial_shape == b.spatial_shape
    assert a.batch_size == b.batch_size
    # Compare a against b; the previous check compared a with itself and
    # so could never fail.
    assert a.features.shape[1] == b.features.shape[1]
    res_shape = [a.batch_size, *a.spatial_shape, a.features.shape[1]]
    a_th = torch.sparse_coo_tensor(a.indices.T, a.features, res_shape, requires_grad=True)
    b_th = torch.sparse_coo_tensor(b.indices.T, b.features, res_shape, requires_grad=True)
    c_th = (a_th + b_th).coalesce()
    # torch stores sparse indices as (ndim, nnz); transpose back to one
    # row per point.
    c_th_inds = c_th.indices().T.contiguous().int()
    c_th_values = c_th.values()
    assert c_th_values.is_contiguous()
    return SparseConvTensor(c_th_values, c_th_inds, a.spatial_shape, a.batch_size)
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import torch
from cumm import tensorview as tv
from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
from spconv.core_cc.csrc.hash.core import HashTable as _HashTable
_TORCH_DTYPE_TO_ITEMSIZE = {
torch.int32: 4,
torch.int64: 8,
torch.float32: 4,
torch.float64: 8,
}
class HashTable:
    """Simple hash table for 32- and 64-bit data, on CPU or CUDA.

    A CUDA table has a fixed size, so a maximum size must be given at
    construction (about twice the number of keys is recommended).

    See sparse_add_hash_based in spconv/pytorch/functional for a real
    example that uses this table to add sparse tensors with the same shape
    but different indices.
    """

    def __init__(self, device: torch.device, key_dtype: torch.dtype,
                 value_dtype: torch.dtype,
                 max_size: int = -1) -> None:
        """Create the table on ``device``.

        Args:
            device: torch device; anything other than "cpu" takes the
                CUDA path.
            key_dtype: key dtype; must be a key of _TORCH_DTYPE_TO_ITEMSIZE.
            value_dtype: value dtype; same restriction as ``key_dtype``.
            max_size: number of slots, required (> 0) for a CUDA table.
        """
        self.is_cpu = device.type == "cpu"
        self.key_dtype = key_dtype
        self.value_dtype = value_dtype
        # A CPU table passes empty tv tensors to the backend; a CUDA table
        # passes pre-allocated storage.
        key_data_tv = tv.Tensor()
        value_data_tv = tv.Tensor()
        if self.is_cpu:
            self.keys_data = None
            self.values_data = None
        else:
            assert max_size > 0, "you must provide max_size for fixed-size cuda hash table, usually *2 of num of keys"
            assert device is not None, "you must specify device for cuda hash table."
            self.keys_data = torch.empty([max_size], dtype=key_dtype, device=device)
            self.values_data = torch.empty([max_size], dtype=value_dtype, device=device)
            key_data_tv = torch_tensor_to_tv(self.keys_data)
            value_data_tv = torch_tensor_to_tv(self.values_data)
        stream = self._current_stream()
        self.key_itemsize = _TORCH_DTYPE_TO_ITEMSIZE[self.key_dtype]
        self.value_itemsize = _TORCH_DTYPE_TO_ITEMSIZE[self.value_dtype]
        self._valid_value_dtype_for_arange = {torch.int32, torch.int64}
        self._table = _HashTable(self.is_cpu, self.key_itemsize,
                                 self.value_itemsize, key_data_tv,
                                 value_data_tv, stream)

    def _current_stream(self):
        """Return 0 for a CPU table, else the result of get_current_stream()."""
        if self.is_cpu:
            return 0
        return get_current_stream()

    def _alloc_cuda_count(self):
        """Allocate the one-element counter used by the CUDA code paths.

        Returns:
            ``(count, count_tv)``: a zero-initialised torch tensor on the
            table's device and its tv view. The view is reinterpreted as
            unsigned, with a width chosen from the key item size.
        """
        assert self.values_data is not None
        if self.key_itemsize == 4:
            torch_dtype, tv_dtype = torch.int32, tv.uint32
        elif self.key_itemsize == 8:
            torch_dtype, tv_dtype = torch.int64, tv.uint64
        else:
            raise NotImplementedError
        count = torch.zeros([1], dtype=torch_dtype, device=self.values_data.device)
        return count, torch_tensor_to_tv(count, dtype=tv_dtype)

    def insert(self, keys: torch.Tensor, values: Optional[torch.Tensor] = None):
        """Insert ``keys`` (and optionally ``values``) into the table.

        When ``values`` is None only the keys are inserted and an empty tv
        tensor is passed to the backend; the stored value is then undefined.
        """
        keys_tv = torch_tensor_to_tv(keys)
        values_tv = tv.Tensor() if values is None else torch_tensor_to_tv(values)
        return self._table.insert(keys_tv, values_tv, self._current_stream())

    def query(self, keys: torch.Tensor, values: Optional[torch.Tensor] = None):
        """Look up ``keys`` in the table.

        Args:
            keys: keys to look up.
            values: optional output buffer. When omitted, a new tensor
                with the table's value dtype is allocated on the device
                of ``keys``.

        Returns:
            ``(values, is_empty)``. ``is_empty`` is a uint8 tensor with
            one entry per key, filled by the backend.
        """
        keys_tv = torch_tensor_to_tv(keys)
        num_keys = keys.shape[0]
        if values is None:
            values = torch.empty([num_keys], dtype=self.value_dtype, device=keys.device)
        values_tv = torch_tensor_to_tv(values)
        stream = self._current_stream()
        is_empty = torch.empty([num_keys], dtype=torch.uint8, device=keys.device)
        is_empty_tv = torch_tensor_to_tv(is_empty)
        self._table.query(keys_tv, values_tv, is_empty_tv, stream)
        return values, is_empty

    def assign_arange_(self):
        """Ask the backend to assign an arange to the table's values.

        Returns:
            A one-element count tensor. For a CPU table it holds
            ``size_cpu()``. For a CUDA table it is the counter handed to
            the backend.
            NOTE(review): presumably filled with the number of items;
            confirm against the C++ source.
        """
        stream = self._current_stream()
        if self.is_cpu:
            # The value dtype is only validated on the CPU path.
            assert self.value_dtype in self._valid_value_dtype_for_arange
            count_tv = tv.Tensor()
            count = torch.tensor([self._table.size_cpu()], dtype=torch.int64)
        else:
            count, count_tv = self._alloc_cuda_count()
        self._table.assign_arange_(count_tv, stream)
        return count

    def items(self, max_size: int = -1):
        """Dump the stored keys and values.

        Args:
            max_size: CUDA only. Length of the output buffers; -1 means
                the table's full capacity. Ignored for a CPU table, which
                always uses ``size_cpu()``.

        Returns:
            ``(keys, values, count)``. Callers slice the buffers with
            ``count.item()`` to get the valid entries.
        """
        stream = self._current_stream()
        if self.is_cpu:
            count_tv = tv.Tensor()
            max_size = self._table.size_cpu()
            count = torch.tensor([max_size], dtype=torch.int64)
            keys = torch.empty([max_size], dtype=self.key_dtype)
            values = torch.empty([max_size], dtype=self.value_dtype)
        else:
            count, count_tv = self._alloc_cuda_count()
            if max_size == -1:
                max_size = self.values_data.shape[0]
            device = self.values_data.device
            keys = torch.empty([max_size], dtype=self.key_dtype, device=device)
            values = torch.empty([max_size], dtype=self.value_dtype, device=device)
        keys_tv = torch_tensor_to_tv(keys)
        values_tv = torch_tensor_to_tv(values)
        self._table.items(keys_tv, values_tv, count_tv, stream)
        return keys, values, count
def main():
    """Exercise insert, query, items and assign_arange_ on CPU, then CUDA.

    Results are printed. The CUDA pass uses device "cuda:0" and a table of
    1000 slots.
    """
    max_size = 1000
    k_dtype, v_dtype = torch.int32, torch.int64
    for dev in (torch.device("cpu"), torch.device("cuda:0")):
        # Only the CUDA table takes a max_size.
        extra_kwargs = {} if dev.type == "cpu" else {"max_size": max_size}
        table = HashTable(dev, k_dtype, v_dtype, **extra_kwargs)

        keys = torch.tensor([5, 3, 7, 4, 6, 2, 10, 8], dtype=k_dtype, device=dev)
        values = torch.tensor([1, 6, 4, 77, 23, 756, 12, 12], dtype=v_dtype, device=dev)
        keys_query = torch.tensor([8, 10, 2, 6, 4, 7, 3, 5], dtype=k_dtype, device=dev)

        table.insert(keys, values)
        vq, _ = table.query(keys_query)
        print(vq)

        # Dump the table twice: as inserted, then after the values have
        # been replaced by an arange.
        for renumber in (False, True):
            if renumber:
                table.assign_arange_()
            ks, vs, cnt = table.items()
            cnt_item = cnt.item()
            print(cnt, ks[:cnt_item], vs[:cnt_item])
if __name__ == "__main__":
main()
\ No newline at end of file
...@@ -22,6 +22,7 @@ import numpy as np ...@@ -22,6 +22,7 @@ import numpy as np
import spconv import spconv
from spconv.core import AlgoHint, ConvAlgo from spconv.core import AlgoHint, ConvAlgo
from typing import List, Optional, Union from typing import List, Optional, Union
from spconv.pytorch.core import ThrustSortAllocator
from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
from spconv.core_cc.csrc.sparse.all import SpconvOps from spconv.core_cc.csrc.sparse.all import SpconvOps
import spconv.core_cc as _ext import spconv.core_cc as _ext
...@@ -43,24 +44,6 @@ from spconv.tools import CUDAKernelTimer ...@@ -43,24 +44,6 @@ from spconv.tools import CUDAKernelTimer
DEBUG = False DEBUG = False
class ThrustSortAllocator:
    """Cache of uint8 scratch buffers, handed out as raw data pointers.

    Buffers are kept in ``alloced_objs`` keyed by the size they were
    created with, and are never released by this class.
    """

    def __init__(self, device: torch.device) -> None:
        super().__init__()
        self.alloced_objs = {}
        self.device = device

    def alloc(self, n: int):
        """Return the data pointer of a buffer holding at least ``n`` bytes.

        An exact-size buffer is preferred. Otherwise the first cached
        buffer (in insertion order) that is strictly larger than ``n`` is
        reused. Only when neither exists is a new buffer allocated on
        ``self.device`` and cached.
        """
        exact = self.alloced_objs.get(n)
        if exact is not None:
            return exact.data_ptr()
        for capacity, buf in self.alloced_objs.items():
            if capacity > n:
                return buf.data_ptr()
        fresh = torch.empty([n], dtype=torch.uint8, device=self.device)
        self.alloced_objs[n] = fresh
        return fresh.data_ptr()
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation): def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
ndim = len(input_size) ndim = len(input_size)
output_size = [] output_size = []
...@@ -1482,3 +1465,4 @@ def indice_maxpool_implicit_gemm_backward(features, out_features, out_bp, ...@@ -1482,3 +1465,4 @@ def indice_maxpool_implicit_gemm_backward(features, out_features, out_bp,
out_bp_tv, din_tv, out_bp_tv, din_tv,
indice_pairs_tv, stream) indice_pairs_tv, stream)
return din return din
...@@ -19,37 +19,68 @@ from torch.autograd import Function ...@@ -19,37 +19,68 @@ from torch.autograd import Function
from spconv.pytorch.modules import SparseModule from spconv.pytorch.modules import SparseModule
from spconv.pytorch.core import SparseConvTensor from spconv.pytorch.core import SparseConvTensor
from typing import List from typing import List
from spconv.pytorch import functional as F
class JoinTable(SparseModule): # Module): class JoinTable(SparseModule):
def forward(self, input: List[SparseConvTensor]): def forward(self, input: List[SparseConvTensor]):
msg = "you can't use JoinTable in two sptensor with different indices."
for ten in input:
assert ten.spatial_shape == input[0].spatial_shape, msg
assert ten.batch_size == input[0].batch_size, msg
assert ten.features.shape[1] == input[0].features.shape[1], msg
assert ten.indices.shape[0] == input[0].indices.shape[0], msg
output = SparseConvTensor(torch.cat([i.features for i in input], 1), output = SparseConvTensor(torch.cat([i.features for i in input], 1),
input[0].indices, input[0].spatial_shape, input[0].indices, input[0].spatial_shape,
input[0].batch_size, input[0].grid, input[0].batch_size, input[0].grid,
input[0].voxel_num, input[0].indice_dict) input[0].voxel_num, input[0].indice_dict)
output.benchmark_record = input[1].benchmark_record output.benchmark_record = input[1].benchmark_record
output.thrust_allocator = input[1].thrust_allocator output.thrust_allocator = input[1].thrust_allocator
output._timer = input[1]._timer
return output return output
def input_spatial_size(self, out_size): def input_spatial_size(self, out_size):
return out_size return out_size
class AddTable(SparseModule): # Module): class AddTable(SparseModule):
def forward(self, input: List[SparseConvTensor]): def forward(self, input: List[SparseConvTensor]):
msg = "you can't use AddTable in two sptensor with different indices. use AddTableMisaligned instead."
for ten in input:
assert ten.spatial_shape == input[0].spatial_shape, msg
assert ten.batch_size == input[0].batch_size, msg
assert ten.features.shape[1] == input[0].features.shape[1], msg
assert ten.indices.shape[0] == input[0].indices.shape[0], msg
output = SparseConvTensor(sum([i.features for i in input]), output = SparseConvTensor(sum([i.features for i in input]),
input[0].indices, input[0].spatial_shape, input[0].indices, input[0].spatial_shape,
input[0].batch_size, input[0].grid, input[0].batch_size, input[0].grid,
input[0].voxel_num, input[0].indice_dict) input[0].voxel_num, input[0].indice_dict)
output.benchmark_record = input[1].benchmark_record output.benchmark_record = input[1].benchmark_record
output.thrust_allocator = input[1].thrust_allocator output.thrust_allocator = input[1].thrust_allocator
output._timer = input[1]._timer
return output return output
def input_spatial_size(self, out_size): def input_spatial_size(self, out_size):
return out_size return out_size
class AddTableMisaligned(SparseModule):
    """Add sparse tensors that have the same shape but different indices.

    Slower than AddTable.

    WARNING: do not use this in segmentation networks such as U-Net.
    Adding misaligned tensors clears the downsample indices, which stops
    SparseInverseConvXd from working.
    """

    def forward(self, input: List[SparseConvTensor]):
        """Return the hash-based sum of every tensor in ``input``."""
        return F.sparse_add_hash_based(*input)

    def input_spatial_size(self, out_size):
        """Return ``out_size`` unchanged."""
        return out_size
class ConcatTable(SparseModule): # Module): class ConcatTable(SparseModule):
def forward(self, input): def forward(self, input):
return [module(input) for module in self._modules.values()] return [module(input) for module in self._modules.values()]
......
...@@ -140,7 +140,6 @@ class PointToVoxel(object): ...@@ -140,7 +140,6 @@ class PointToVoxel(object):
num_voxels = res[0].shape[0] num_voxels = res[0].shape[0]
else: else:
pc_tv = torch_tensor_to_tv(pc) pc_tv = torch_tensor_to_tv(pc)
stream = get_current_stream()
voxels_tv = torch_tensor_to_tv(self.voxels) voxels_tv = torch_tensor_to_tv(self.voxels)
indices_tv = torch_tensor_to_tv(self.indices) indices_tv = torch_tensor_to_tv(self.indices)
num_per_voxel_tv = torch_tensor_to_tv(self.num_per_voxel) num_per_voxel_tv = torch_tensor_to_tv(self.num_per_voxel)
......
2.1.13 2.1.14
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment