Commit 076bdb05 authored by yan.yan's avatar yan.yan
Browse files

v2.1.14: add hash table, fix small bug

parent d406d9e2
......@@ -30,14 +30,14 @@ jobs:
- 'spconv/algo.py'
- 'spconv/core.py'
- 'pyproject.toml'
- name: Install CUDA
- name: Install Boost
env:
CUDA_VERSION: ${{ matrix.cuda-version }}
PYTHON_VERSION: ${{ matrix.python-version }}
cuda: ${{ matrix.cuda-version }}
BOOST_VERSION: boost_1_77_0
if: |
(env.CUDA_VERSION != '') && (
(
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) ||
(
(steps.changes.outputs.needbuild == 'true') &&
......@@ -49,7 +49,24 @@ jobs:
$ProgressPreference = 'SilentlyContinue'
Invoke-WebRequest -Uri "https://boostorg.jfrog.io/artifactory/main/release/1.77.0/source/boost_1_77_0.zip" -UseBasicParsing -OutFile $HOME/boost.zip
Expand-Archive $HOME/boost.zip -DestinationPath $HOME/boost
- name: Install CUDA
env:
CUDA_VERSION: ${{ matrix.cuda-version }}
PYTHON_VERSION: ${{ matrix.python-version }}
cuda: ${{ matrix.cuda-version }}
if: |
(env.CUDA_VERSION != '') && (
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) ||
(
(steps.changes.outputs.needbuild == 'true') &&
(env.PYTHON_VERSION == '3.10')
)
)
shell: powershell
run: |
.\tools\install_windows_cuda.ps1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
......@@ -66,6 +83,7 @@ jobs:
CUDA_VERSION: ${{ matrix.cuda-version }}
PYTHON_VERSION: ${{ matrix.python-version }}
BOOST_VERSION: boost_1_77_0
CUMM_CUDA_VERSION: ${{ matrix.cuda-version }}
if: |
(env.CUDA_VERSION != '') && (
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) ||
......@@ -75,7 +93,6 @@ jobs:
)
)
run: |
$Env:CUMM_CUDA_VERSION = "${{ matrix.cuda-version }}"
$Env:CUMM_CUDA_ARCH_LIST = "all"
$Env:SPCONV_DISABLE_JIT = "1"
pip install pccm pybind11
......
# Changelog
## [2.1.14] - 2021-11-28
### Added
- Add hash table
- Update cumm version
- Add AddTableMisaligned for sptensors with same shape but different indices.
### Fixed
- Fix a regression: a bug that was fixed in 2.1.10 had been reintroduced in 2.1.12.
## [2.1.13] - 2021-?-?
### Added
- Add some ops from spconv 1.x, see spconv.utils for more details.
......
......@@ -171,7 +171,9 @@ You need to rebuild ```cumm``` first if you are build along a CUDA version that
5. run ```pip install pccm cumm wheel```
6. run ```python setup.py bdist_wheel```+```pip install dists/xxx.whl```
## Known issues
* Spconv 2.x F16 runs slowly on A100.
## Note
......
......@@ -27,15 +27,17 @@ Network Code: test/benchmark.py
| F16 Forward | Native| Implicit Gemm | Implicit Gemm Split Mask |
| -------------- |:---------------------:|---------------------:| ---------------------:|
| RTX 3080 Laptop 150W | 13.7ms | 11.2ms | 12.2ms |
| RTX 3080 Laptop 150W@1755MHz | 13.7ms | 11.2ms | 12.2ms |
| RTX A6000 | 19.1ms | 11.7ms | 14.0ms |
| TESLA V100 | 17.9ms | 11.4ms | 13.4ms |
| A100 | 23.8ms | 12.4ms | 15.1ms |
| F16 Backward | Native| Implicit Gemm | Implicit Gemm Split Mask |
| -------------- |:---------------------:|---------------------:| ---------------------:|
| RTX 3080 Laptop 150W | 25.2ms | 13.8ms | 12.2ms |
| RTX 3080 Laptop 150W@1755MHz | 25.2ms | 13.8ms | 12.2ms |
| RTX A6000 | 28.1ms | 9.2ms | 8.9ms |
| TESLA V100 | 33.9ms | 12.2ms | 12.9ms |
| A100 | 37.6ms | 12.2ms | 13.9ms |
### Network Gemm Kernel Benchmark FP16 in RTX 3080 Laptop GPU
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from spconv.pytorch.hash import HashTable
def main():
    """Demonstrate the fixed-size hash table on CPU and, if available, CUDA.

    The CUDA hash table can't delete keys after insert and can't be resized:
    you need to pre-define a fixed capacity, and about 2x the number of keys
    is recommended.
    """
    max_size = 1000
    k_dtype = torch.int32
    v_dtype = torch.int64
    # Only run the CUDA part when a CUDA device is usable; allocating the
    # table storage on "cuda:0" would otherwise fail on CPU-only machines.
    devices = [torch.device("cpu")]
    if torch.cuda.is_available():
        devices.append(torch.device("cuda:0"))
    for dev in devices:
        if dev.type == "cpu":
            table = HashTable(dev, k_dtype, v_dtype)
        else:
            # the CUDA table requires an explicit capacity.
            table = HashTable(dev, k_dtype, v_dtype, max_size=max_size)
        keys = torch.tensor([5, 3, 7, 4, 6, 2, 10, 8], dtype=k_dtype, device=dev)
        values = torch.tensor([1, 6, 4, 77, 23, 756, 12, 12], dtype=v_dtype, device=dev)
        keys_query = torch.tensor([8, 10, 2, 6, 4, 7, 3, 5], dtype=k_dtype, device=dev)
        table.insert(keys, values)
        vq, _ = table.query(keys_query)
        print(vq)
        ks, vs, cnt = table.items()
        cnt_item = cnt.item()
        print(cnt, ks[:cnt_item], vs[:cnt_item])
        # overwrite every stored value with a running index.
        table.assign_arange_()
        ks, vs, cnt = table.items()
        cnt_item = cnt.item()
        print(cnt, ks[:cnt_item], vs[:cnt_item])
if __name__ == "__main__":
main()
\ No newline at end of file
......@@ -29,6 +29,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from spconv.csrc.sparse.all import SpconvOps
from spconv.csrc.utils import BoxOps
from spconv.csrc.hash.core import HashTable
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS +
SHUFFLE_TURING_PARAMS)
......@@ -40,7 +41,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
if InWindows:
# Windows has a command-line length limit, so we use objects_folder to reduce the command size.
objects_folder = "objects"
pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps()],
pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps(), HashTable()],
PACKAGE_ROOT / "core_cc",
namespace_root=PACKAGE_ROOT,
objects_folder=objects_folder,
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class HashTable:
    """Type stub for the compiled hash table binding (CPU and CUDA)."""
    # size in bytes of one key / one value (the constructor requires 4 or 8).
    key_itemsize: int
    value_itemsize: int
    # True when the table is the CPU implementation.
    is_cpu: bool
    # running total of keys passed to insert() on a non-CPU table.
    insert_count: int
    def __init__(self, is_cpu: bool, key_itemsize: int, value_itemsize: int, keys_data: Tensor, values_data: Tensor, stream: int = 0) -> None:
        """
        Args:
            is_cpu: create a CPU table; otherwise keys_data/values_data
                are used as the table storage.
            key_itemsize: key size in bytes, must be 4 or 8.
            value_itemsize: value size in bytes, must be 4 or 8.
            keys_data: key storage of a non-CPU table; must be non-empty,
                have the same length as values_data and an itemsize equal
                to key_itemsize.
            values_data: value storage of a non-CPU table; must be
                non-empty with an itemsize equal to value_itemsize.
            stream: stream handle forwarded to clear() when a non-CPU
                table is constructed.
        """
        ...
    def clear(self, stream: int = 0) -> None:
        """
        clear the table.
        Args:
            stream: stream handle used for the non-CPU clear kernel.
        """
        ...
    def insert(self, keys: Tensor, values: Tensor = Tensor(), stream: int = 0) -> None:
        """
        in this function, if values is empty, it will be assigned to zero.
        Args:
            keys: keys to insert; itemsize must equal key_itemsize.
            values: optional values, one per key; itemsize must equal
                value_itemsize.
            stream: stream handle used by the non-CPU implementation.
        """
        ...
    def query(self, keys: Tensor, values: Tensor, is_empty: Tensor, stream: int) -> None:
        """
        query keys, save to values, and save is_empty to is_empty
        Args:
            keys: keys to look up.
            values: output buffer with one entry per key.
            is_empty: uint8 output buffer with one entry per key.
            stream: stream handle used by the non-CPU implementation.
        """
        ...
    def assign_arange_(self, count: Tensor, stream: int = 0) -> None:
        """
        this function assign "arange(NumItem)" to table values.
        useful in "unique-like" operations.
        unlike insert/query, this method only support i32/i64/u32/u64 for value.
        count must be u32/u64.
        Args:
            count: counter tensor used by the non-CPU implementation.
            stream: stream handle used by the non-CPU implementation.
        """
        ...
    def size_cpu(self) -> int:
        """
        this function can only be used to get cpu hash table size.
        """
        ...
    def items(self, keys: Tensor, values: Tensor, count: Tensor, stream: int) -> None:
        """
        get items.
        Args:
            keys: output buffer for keys.
            values: output buffer for values; same length as keys.
            count: counter tensor used by the non-CPU implementation.
            stream: stream handle used by the non-CPU implementation.
        """
        ...
......@@ -14,36 +14,3 @@ class BoxOps:
eps:
"""
...
@staticmethod
def rotate_non_max_suppression_cpu(box_corners: Tensor, order: Tensor, standup_iou: Tensor, thresh: float, eps: float = 0) -> List[int]:
"""
Args:
box_corners:
order:
standup_iou:
thresh:
eps:
"""
...
@staticmethod
def rbbox_iou(box_corners: Tensor, qbox_corners: Tensor, standup_iou: Tensor, overlaps: Tensor, standup_thresh: float, inter_only: bool) -> None:
"""
Args:
box_corners:
qbox_corners:
standup_iou:
overlaps:
standup_thresh:
inter_only:
"""
...
@staticmethod
def rbbox_iou_aligned(box_corners: Tensor, qbox_corners: Tensor, overlaps: Tensor, inter_only: bool) -> None:
"""
Args:
box_corners:
qbox_corners:
overlaps:
inter_only:
"""
...
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from pathlib import Path
from cumm.constants import CUMM_CPU_ONLY_BUILD
import pccm
from cumm.common import (TensorView, TensorViewCPU, TensorViewHashKernel,
TensorViewKernel, TslRobinMap)
from spconv.csrc.sparse.cpu_core import OMPLib
class HashTable(pccm.Class, pccm.pybind.PybindClassMixin):
"""a simple hashtable for both cpu and cuda.
CPU implementation don't support parallel.
both cpu and cuda only support 32/64bit key value.
"""
def __init__(self):
    super().__init__()
    self.add_dependency(TensorView, TslRobinMap)
    if CUMM_CPU_ONLY_BUILD:
        self.add_dependency(OMPLib)
    self.add_include("tensorview/parallel/all.h")
    # storage tensors handed to the constructor.
    self.add_member("keys_data, values_data", "tv::Tensor")
    # item sizes are exposed to python without the trailing underscore.
    for prop in ("key_itemsize", "value_itemsize"):
        self.add_pybind_member(f"{prop}_", "int", prop_name=prop, readwrite=False)
    self.add_pybind_member("is_cpu", "bool", readwrite=False)
    # one robin_map per (key bytes, value bytes) combination, declared in
    # the order 4_4, 4_8, 8_4, 8_8.
    for key_bytes in (4, 8):
        for value_bytes in (4, 8):
            self.add_member(
                f"map_{key_bytes}_{value_bytes}",
                f"tsl::robin_map<uint{key_bytes * 8}_t, uint{value_bytes * 8}_t>")
    self.add_pybind_member("insert_count_", "int64_t", prop_name="insert_count", readwrite=False)
@pccm.pybind.mark
@pccm.constructor
def ctor(self):
    # Builds the C++ constructor: it stores the arguments in the members,
    # checks the item sizes, and for a non-CPU table validates the storage
    # tensors and clears the table.
    code = pccm.FunctionCode()
    code.arg("is_cpu", "bool")
    code.arg("key_itemsize, value_itemsize", "int")
    code.arg("keys_data", "tv::Tensor")
    code.arg("values_data", "tv::Tensor")
    code.arg("stream", "std::uintptr_t", "0")
    # member initializers; insert_count_ starts at zero.
    code.ctor_init("is_cpu", "is_cpu")
    code.ctor_init("keys_data", "keys_data")
    code.ctor_init("values_data", "values_data")
    code.ctor_init("key_itemsize_", "key_itemsize")
    code.ctor_init("value_itemsize_", "value_itemsize")
    code.ctor_init("insert_count_", "0")
    code.raw(f"""
TV_ASSERT_RT_ERR(key_itemsize == 4 || key_itemsize == 8, "key_itemsize must be 4 or 8");
TV_ASSERT_RT_ERR(value_itemsize == 4 || value_itemsize == 8, "value_itemsize must be 4 or 8");
if (!is_cpu){{
TV_ASSERT_RT_ERR(!keys_data.empty() && !values_data.empty(), "key and value must not empty");
TV_ASSERT_RT_ERR(keys_data.dim(0) == values_data.dim(0), "key and value must have same size");
TV_ASSERT_RT_ERR(key_itemsize == keys_data.itemsize(), "key_itemsize must equal to key_data");
TV_ASSERT_RT_ERR(value_itemsize == values_data.itemsize(), "value_itemsize must equal to values_data");
// clear cuda table here.
clear(stream);
}}
""")
    # a CPU-only build additionally rejects non-CPU tables; note that this
    # assert is emitted after the block above.
    if CUMM_CPU_ONLY_BUILD:
        code.raw(f"TV_ASSERT_RT_ERR(is_cpu, \"spconv not built with CUDA\");")
    return code
@pccm.pybind.mark
@pccm.cuda.member_function
def clear(self):
    """ clear the table: empty the cpu maps, or run the clear kernel
    over the whole cuda table.
    """
    code = pccm.FunctionCode()
    if not CUMM_CPU_ONLY_BUILD:
        code.add_dependency(TensorViewHashKernel)
    code.arg("stream", "std::uintptr_t", "0")
    # code.if_ already emits the "is_cpu" condition, so the body does not
    # repeat it. All four maps are cleared because only one of them is in
    # use, selected at runtime by the item sizes.
    with code.if_("is_cpu"):
        code.raw("""
map_4_4.clear();
map_4_8.clear();
map_8_4.clear();
map_8_8.clear();
return;
""")
    if not CUMM_CPU_ONLY_BUILD:
        # cuda branch: dispatch on key/value item size and launch
        # clear_table_split over every slot of the table.
        with code.else_():
            code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch_int<4, 8>(keys_data.itemsize(), [&](auto IK){{
constexpr int IKV = TV_DECLTYPE(IK)::value;
using K = tv::hash::itemsize_to_unsigned_t<IKV>;
constexpr K kEmptyKey = std::numeric_limits<K>::max();
K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
tv::dispatch_int<4, 8>(values_data.itemsize(), [&](auto IV){{
constexpr int IVV = TV_DECLTYPE(IV)::value;
using V = tv::hash::itemsize_to_unsigned_t<IVV>;
V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
using table_t =
tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
kEmptyKey, false>;
table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
tv::cuda::Launch launcher(table.size(), custream);
launcher(tv::hash::clear_table_split<table_t>, table);
}});
}});
""")
    return code
@pccm.pybind.mark
@pccm.cuda.member_function
def insert(self):
    """ in this function, if values is empty, it will be assigned to zero.
    """
    code = pccm.FunctionCode()
    if not CUMM_CPU_ONLY_BUILD:
        code.add_dependency(TensorViewHashKernel)
    code.arg("keys", "tv::Tensor")
    code.arg("values", "tv::Tensor", "tv::Tensor()", pyanno="cumm.tensorview.Tensor = Tensor()")
    code.arg("stream", "std::uintptr_t", "0")
    # Validate the inputs before touching any state, so that a rejected
    # call does not inflate insert_count_.
    code.raw(f"""
auto N = keys.dim(0);
TV_ASSERT_RT_ERR(keys.itemsize() == key_itemsize_, "keys itemsize not equal to", key_itemsize_);
if (!values.empty()){{
TV_ASSERT_RT_ERR(values.itemsize() == value_itemsize_, "values itemsize not equal to", value_itemsize_);
TV_ASSERT_RT_ERR(keys.dim(0) == values.dim(0), "number of key and value must same");
}}
""")
    with code.if_("is_cpu"):
        map_name = "cpu_map"
        # one generated branch per supported (key, value) item size pair.
        for k_type, v_type in self.cpu_map_storage_select("key_itemsize_", "value_itemsize_", map_name, code):
            code.raw(f"""
auto k_ptr = reinterpret_cast<const {k_type}*>(keys.raw_data());
if (values.empty()){{
for (size_t i = 0; i < N; ++i){{
{map_name}.insert({{k_ptr[i], {v_type}(0)}});
}}
}}
else{{
auto v_ptr = reinterpret_cast<const {v_type}*>(values.raw_data());
for (size_t i = 0; i < N; ++i){{
{map_name}.insert({{k_ptr[i], v_ptr[i]}});
}}
}}
""")
    # The non-cpu path always lives in the else branch. In a cpu-only
    # build that branch only throws; emitting the throw outside of an
    # else would make every cpu insert fail after it succeeded.
    with code.else_():
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"""
TV_THROW_RT_ERR("spconv not compiled with cuda, don't support cuda");
""")
        else:
            # capacity accounting for the fixed-size table, then the
            # insert kernel dispatched on key/value item size.
            code.raw(f"""
int64_t value_after_insert = keys.dim(0) + insert_count_;
TV_ASSERT_RT_ERR(value_after_insert < keys_data.dim(0), "inserted count exceed maximum hash size");
insert_count_ += keys.dim(0);
auto custream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch_int<4, 8>(keys_data.itemsize(), [&](auto IK){{
constexpr int IKV = TV_DECLTYPE(IK)::value;
using K = tv::hash::itemsize_to_unsigned_t<IKV>;
constexpr K kEmptyKey = std::numeric_limits<K>::max();
K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
const K* key_ptr = reinterpret_cast<const K*>(keys.raw_data());
tv::dispatch_int<4, 8>(values_data.itemsize(), [&](auto IV){{
constexpr int IVV = TV_DECLTYPE(IV)::value;
using V = tv::hash::itemsize_to_unsigned_t<IVV>;
V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
const V* value_ptr = reinterpret_cast<const V*>(values.raw_data());
using table_t =
tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
kEmptyKey, false>;
tv::cuda::Launch launcher(N, custream);
table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
launcher(tv::hash::insert_split<table_t>, table, key_ptr, value_ptr, size_t(N));
}});
}});
""")
    return code
@pccm.pybind.mark
@pccm.cuda.member_function
def query(self):
    """query keys, save to values, and save is_empty to is_empty
    """
    code = pccm.FunctionCode()
    if not CUMM_CPU_ONLY_BUILD:
        code.add_dependency(TensorViewHashKernel)
    code.arg("keys", "tv::Tensor")
    code.arg("values", "tv::Tensor")
    code.arg("is_empty", "tv::Tensor")
    code.arg("stream", "std::uintptr_t")
    # shared validation: item sizes must match the table and all three
    # tensors must have the same length.
    code.raw(f"""
auto N = keys.dim(0);
TV_ASSERT_RT_ERR(keys.itemsize() == key_itemsize_, "keys itemsize not equal to", key_itemsize_);
TV_ASSERT_RT_ERR(values.itemsize() == value_itemsize_, "values itemsize not equal to", value_itemsize_);
TV_ASSERT_RT_ERR(N == values.dim(0) && is_empty.dim(0) == N, "number of key and value must same");
auto is_empty_ptr = is_empty.data_ptr<uint8_t>();
""")
    with code.if_("is_cpu"):
        map_name = "cpu_map"
        # here it's safe to use omp in query.
        # values[i] is only written when the key is found; is_empty[i] is
        # always written.
        for k_type, v_type in self.cpu_map_storage_select("key_itemsize_", "value_itemsize_", map_name, code):
            code.raw(f"""
auto k_ptr = reinterpret_cast<{k_type}*>(keys.raw_data());
auto v_ptr = reinterpret_cast<{v_type}*>(values.raw_data());
tv::kernel_1d_cpu(keys.device(), N, [&](size_t begin, size_t end, size_t step){{
bool emp;
for (size_t i = begin; i < end; i += step){{
auto iter = {map_name}.find(k_ptr[i]);
emp = iter == {map_name}.end();
if (!emp){{
v_ptr[i] = iter->second;
}}
is_empty_ptr[i] = uint8_t(emp);
}}
}});
""")
    # The non-cpu path always lives in the else branch. In a cpu-only
    # build that branch only throws; emitting the throw outside of an
    # else would make every cpu query fail after it succeeded.
    with code.else_():
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"""
TV_THROW_RT_ERR("spconv not compiled with cuda, don't support cuda");
""")
        else:
            code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch_int<4, 8>(keys_data.itemsize(), [&](auto IK){{
constexpr int IKV = TV_DECLTYPE(IK)::value;
using K = tv::hash::itemsize_to_unsigned_t<IKV>;
constexpr K kEmptyKey = std::numeric_limits<K>::max();
K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
K* key_ptr = reinterpret_cast<K*>(keys.raw_data());
tv::dispatch_int<4, 8>(values_data.itemsize(), [&](auto IV){{
constexpr int IVV = TV_DECLTYPE(IV)::value;
using V = tv::hash::itemsize_to_unsigned_t<IVV>;
V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
V* value_ptr = reinterpret_cast<V*>(values.raw_data());
using table_t =
tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
kEmptyKey, false>;
tv::cuda::Launch launcher(N, custream);
table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
launcher(tv::hash::query_split<table_t>, table, key_ptr, value_ptr, is_empty_ptr, size_t(N));
}});
}});
""")
    return code
@pccm.pybind.mark
@pccm.cuda.member_function
def assign_arange_(self):
    """ this function assign "arange(NumItem)" to table values.
    useful in "unique-like" operations.
    unlike insert/query, this method only support i32/i64/u32/u64 for value.
    count must be u32/u64.
    """
    code = pccm.FunctionCode()
    if not CUMM_CPU_ONLY_BUILD:
        code.add_dependency(TensorViewHashKernel)
    code.arg("count", "tv::Tensor")
    code.arg("stream", "std::uintptr_t", "0")
    with code.if_("is_cpu"):
        map_name = "cpu_map"
        # cpu: walk the map once and overwrite each value with a running
        # index; "count" is not used on this path.
        for k_type, v_type in self.cpu_map_storage_select("key_itemsize_", "value_itemsize_", map_name, code):
            code.raw(f"""
{v_type} index = 0;
for (auto it = {map_name}.begin(); it != {map_name}.end(); ++it){{
it.value() = index;
++index;
}}
""")
    # The non-cpu path always lives in the else branch. In a cpu-only
    # build that branch only throws; emitting the throw outside of an
    # else would make every cpu call fail after it succeeded.
    with code.else_():
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"""
TV_THROW_RT_ERR("spconv not compiled with cuda, don't support cuda");
""")
        else:
            # cuda: "count" is read with the unsigned type matching the
            # key item size; values are dispatched on their real dtype.
            code.raw(f"""
TV_ASSERT_RT_ERR(count.device() == 0, "count must be cuda");
auto custream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch_int<4, 8>(keys_data.itemsize(), [&](auto IK){{
constexpr int IKV = TV_DECLTYPE(IK)::value;
using K = tv::hash::itemsize_to_unsigned_t<IKV>;
constexpr K kEmptyKey = std::numeric_limits<K>::max();
auto count_ptr = count.data_ptr<K>();
K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
tv::dispatch<int32_t, int64_t, uint32_t, uint64_t>(values_data.dtype(), [&](auto IV){{
using V = TV_DECLTYPE(IV);
V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
using table_t =
tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
kEmptyKey, false>;
table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
tv::cuda::Launch launcher(table.size(), custream);
launcher(tv::hash::assign_arange_split<table_t, K>, table, count_ptr);
}});
}});
""")
    return code
@pccm.pybind.mark
@pccm.cuda.member_function
def size_cpu(self):
    """ this function can only be used to get cpu hash table size.
    """
    fn = pccm.FunctionCode()
    # the result defaults to -1 and the call is rejected for non-cpu tables.
    fn.raw("""
int64_t res = -1;
TV_ASSERT_RT_ERR(is_cpu, "size_cpu can only be used in cpu hash table");
""")
    selected = "cpu_map"
    with fn.if_("is_cpu"):
        # the selector emits one branch per item size pair; each branch
        # reads the size of the map it selected.
        branches = self.cpu_map_storage_select("key_itemsize_", "value_itemsize_", selected, fn)
        for _dtypes in branches:
            fn.raw(f"""
res = {selected}.size();
""")
    fn.raw("return res;")
    return fn.ret("int64_t")
@pccm.pybind.mark
@pccm.cuda.member_function
def items(self):
    """get items.
    """
    code = pccm.FunctionCode()
    if not CUMM_CPU_ONLY_BUILD:
        code.add_dependency(TensorViewHashKernel)
    code.arg("keys", "tv::Tensor")
    code.arg("values", "tv::Tensor")
    code.arg("count", "tv::Tensor")
    code.arg("stream", "std::uintptr_t")
    # shared validation: output buffers must match the table item sizes
    # and have the same length N.
    code.raw(f"""
auto N = keys.dim(0);
TV_ASSERT_RT_ERR(keys.itemsize() == key_itemsize_, "keys itemsize not equal to", key_itemsize_);
TV_ASSERT_RT_ERR(values.itemsize() == value_itemsize_, "values itemsize not equal to", value_itemsize_);
TV_ASSERT_RT_ERR(N == values.dim(0), "number of key and value must same");
""")
    with code.if_("is_cpu"):
        map_name = "cpu_map"
        # cpu: sequential copy of the map entries into the output
        # buffers, stopping once N entries were written. "count" is not
        # written on this path.
        for k_type, v_type in self.cpu_map_storage_select("key_itemsize_", "value_itemsize_", map_name, code):
            code.raw(f"""
auto k_ptr = reinterpret_cast<{k_type}*>(keys.raw_data());
auto v_ptr = reinterpret_cast<{v_type}*>(values.raw_data());
{v_type} index = 0;
for (auto it = {map_name}.begin(); it != {map_name}.end(); ++it){{
if (index >= N){{
break;
}}
k_ptr[index] = it->first;
v_ptr[index] = it->second;
++index;
}}
""")
    # The non-cpu path always lives in the else branch. In a cpu-only
    # build that branch only throws; emitting the throw outside of an
    # else would make every cpu call fail after it succeeded.
    with code.else_():
        if CUMM_CPU_ONLY_BUILD:
            code.raw(f"""
TV_THROW_RT_ERR("spconv not compiled with cuda, don't support cuda");
""")
        else:
            # cuda: "count" is read with the unsigned type matching the
            # key item size and passed to the iterate kernel.
            code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch_int<4, 8>(keys_data.itemsize(), [&](auto IK){{
constexpr int IKV = TV_DECLTYPE(IK)::value;
using K = tv::hash::itemsize_to_unsigned_t<IKV>;
auto count_ptr = count.data_ptr<K>();
constexpr K kEmptyKey = std::numeric_limits<K>::max();
K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
K* key_ptr = reinterpret_cast<K*>(keys.raw_data());
tv::dispatch_int<4, 8>(values_data.itemsize(), [&](auto IV){{
constexpr int IVV = TV_DECLTYPE(IV)::value;
using V = tv::hash::itemsize_to_unsigned_t<IVV>;
V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
V* value_ptr = reinterpret_cast<V*>(values.raw_data());
using table_t =
tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
kEmptyKey, false>;
tv::cuda::Launch launcher(N, custream);
table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
launcher(tv::hash::iterate_table_split<table_t, K>, table, key_ptr, value_ptr, size_t(N), count_ptr);
}});
}});
""")
    return code
def cpu_map_storage_select(self, k_itemsize: str, v_itemsize: str, res_var: str, code: pccm.FunctionCode):
    """Emit a runtime selection of the cpu map matching the item sizes.

    This is a generator: for every supported (key bytes, value bytes) pair
    it opens an ``if`` on ``k_itemsize``/``v_itemsize``, binds ``res_var``
    to the matching ``map_K_V`` member and yields the C++ key and value
    type names, so the caller can emit the branch body before the branch
    is closed. After all branches, an assert on ``found`` is emitted.
    """
    size_pairs = [(kb, vb) for kb in (4, 8) for vb in (4, 8)]
    with code.block(""):
        code.raw("bool found = false;")
        for key_bytes, value_bytes in size_pairs:
            condition = f"{k_itemsize} == {key_bytes} && {v_itemsize} == {value_bytes}"
            with code.if_(condition):
                code.raw(f"auto& {res_var} = map_{key_bytes}_{value_bytes};")
                # hand control to the caller to fill in this branch.
                yield f"uint{key_bytes * 8}_t", f"uint{value_bytes * 8}_t"
                code.raw("found = true;")
        code.raw("TV_ASSERT_RT_ERR(found, \"suitable hash table not found.\");")
......@@ -3,6 +3,7 @@ from pathlib import Path
import numpy as np
import torch
from spconv.pytorch.core import SparseConvTensor
from spconv.pytorch import functional, ops
from spconv.pytorch.conv import (SparseConv1d, SparseConv2d, SparseConv3d,
SparseConv4d, SparseConvTranspose1d,
......@@ -11,7 +12,6 @@ from spconv.pytorch.conv import (SparseConv1d, SparseConv2d, SparseConv3d,
SparseInverseConv2d, SparseInverseConv3d,
SparseInverseConv4d, SubMConv1d, SubMConv2d,
SubMConv3d, SubMConv4d)
from spconv.pytorch.core import SparseConvTensor
from spconv.pytorch.identity import Identity
from spconv.pytorch.modules import (SparseModule, SparseSequential,
assign_name_for_sparse_modules)
......
......@@ -18,7 +18,6 @@ import numpy as np
import torch
from spconv.core import ConvAlgo
from spconv.pytorch.constants import PYTORCH_VERSION
from spconv.pytorch.ops import ThrustSortAllocator
from spconv.tools import CUDAKernelTimer
if PYTORCH_VERSION >= [1, 8, 0]:
......@@ -39,6 +38,24 @@ else:
pass
class ThrustSortAllocator:
    """Caching allocator that hands out raw pointers to uint8 buffers.

    Buffers are kept in ``alloced_objs`` keyed by their size and are never
    released by this class, so a returned pointer stays valid as long as
    the allocator is alive.
    """

    def __init__(self, device: torch.device) -> None:
        super().__init__()
        self.device = device
        self.alloced_objs = {}

    def alloc(self, n: int):
        """Return the data pointer of a buffer that can hold ``n`` bytes.

        An existing buffer of exactly ``n`` bytes is preferred, then the
        first cached buffer (in insertion order) that is strictly larger;
        otherwise a new buffer is allocated and cached.
        """
        exact = self.alloced_objs.get(n)
        if exact is not None:
            return exact.data_ptr()
        larger = next(
            (buf for size, buf in self.alloced_objs.items() if n < size),
            None)
        if larger is not None:
            return larger.data_ptr()
        fresh = torch.empty([n], dtype=torch.uint8, device=self.device)
        self.alloced_objs[n] = fresh
        return fresh.data_ptr()
class IndiceData(object):
def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
spatial_shape, out_spatial_shape, is_subm: bool, algo: ConvAlgo):
......
......@@ -46,6 +46,9 @@ def torch_tensor_to_tv(ten: torch.Tensor,
dtype = _TORCH_DTYPE_TO_TV[ten.dtype]
return tv.from_blob(ptr, shape, dtype, tv_device)
def torch_tensors_to_tv(*tens: torch.Tensor):
    """Lazily convert each given torch tensor with ``torch_tensor_to_tv``.

    Returns a generator, so the conversions happen when it is consumed
    (for example by tuple unpacking) and it can only be consumed once.
    """
    converted = (torch_tensor_to_tv(tensor) for tensor in tens)
    return converted
def get_current_stream():
    """Return the raw ``cuda_stream`` handle of torch's current CUDA stream."""
    current = torch.cuda.current_stream()
    return current.cuda_stream
......
......@@ -20,15 +20,18 @@ from torch import nn
from torch.autograd import Function
from typing import Optional, TypeVar
from spconv.tools import CUDAKernelTimer
from spconv.pytorch import ops
from spconv.pytorch import ops, SparseConvTensor
from spconv.pytorch.constants import PYTORCH_VERSION
from spconv.debug_utils import spconv_save_debug_data
from torch.autograd.function import once_differentiable
import numpy as np
from pathlib import Path
from spconv.pytorch.hash import HashTable
from cumm.gemm.layout import to_stride
from typing import List
_MAX_INT32 = 2147483647
_T = TypeVar("_T")
def identity_decorator(func: _T) -> _T:
......@@ -357,3 +360,69 @@ indice_inverse_conv = SparseInverseConvFunction.apply
indice_subm_conv = SubMConvFunction.apply
indice_maxpool = SparseMaxPoolFunction.apply
indice_maxpool_implicit_gemm = SparseMaxPoolImplicitGemmFunction.apply
def _indice_to_scalar(indices: torch.Tensor, shape: List[int]):
    """Flatten N-d indices into one scalar key per row.

    Each row of ``indices`` is combined as
    ``indices[:, -1] + sum(stride[i] * indices[:, i])`` over all but the
    last column, with strides obtained from ``to_stride(shape)``.

    Args:
        indices: [N, len(shape)] integer tensor. It is not modified.
        shape: size of every indexed dimension.

    Returns:
        contiguous [N] tensor with the dtype of ``indices``.
    """
    assert indices.shape[1] == len(shape)
    stride = to_stride(np.array(shape, dtype=np.int64))
    # indices[:, -1] is a view; clone it so the in-place accumulation below
    # does not overwrite the last column of the caller's indices tensor.
    scalar_inds = indices[:, -1].clone()
    for i in range(len(shape) - 1):
        scalar_inds += int(stride[i]) * indices[:, i]
    return scalar_inds.contiguous()
def sparse_add_hash_based(*tens: SparseConvTensor):
    """Add sparse tensors that share a shape but may have different indices.

    All inputs must have the same spatial shape, batch size and number of
    feature channels. Every index is flattened to a scalar key and inserted
    into one hash table; ``assign_arange_`` then gives each distinct key an
    output row, and the features of every input are accumulated into the
    rows of its keys.

    Returns:
        a new SparseConvTensor that reuses the benchmark settings, timer
        and thrust allocator of the first input.
    """
    for ten in tens:
        assert ten.spatial_shape == tens[0].spatial_shape
        assert ten.batch_size == tens[0].batch_size
        assert ten.features.shape[1] == tens[0].features.shape[1]
    first = tens[0]
    feat = first.features
    shape = [first.batch_size, *first.spatial_shape]
    whole_shape = int(np.prod(shape))
    # table capacity is twice the total number of input rows.
    table_size = 2 * sum(ten.features.shape[0] for ten in tens)
    # keys only need 64 bits when a flattened index may not fit in int32.
    needs_int64 = whole_shape >= _MAX_INT32
    k_type = torch.int64 if needs_int64 else torch.int32
    table = HashTable(first.features.device, k_type, torch.int32, table_size)
    scalars: List[torch.Tensor] = []
    for ten in tens:
        indices = ten.indices.long() if needs_int64 else ten.indices
        flat = _indice_to_scalar(indices, shape)
        scalars.append(flat)
        table.insert(flat)
    # assign arange to values of hash table
    count = table.assign_arange_()
    num_out = int(count.item())
    out_features = torch.zeros([num_out, feat.shape[1]], dtype=feat.dtype, device=feat.device)
    out_indices = torch.zeros([num_out, first.indices.shape[1]], dtype=first.indices.dtype, device=first.indices.device)
    for ten, flat in zip(tens, scalars):
        rows, _ = table.query(flat)
        rows = rows.long()
        out_features[rows] += ten.features
        out_indices[rows] = ten.indices
    res = SparseConvTensor(out_features, out_indices, first.spatial_shape, first.batch_size,
                           benchmark=first.benchmark)
    res.benchmark_record = first.benchmark_record
    res._timer = first._timer
    res.thrust_allocator = first.thrust_allocator
    return res
def sparse_add(a: SparseConvTensor, b: SparseConvTensor):
    """Add two sparse tensors through torch's sparse COO tensors.

    Both inputs must have the same spatial shape, batch size and number of
    feature channels. Their indices/features are wrapped into COO tensors of
    shape [batch_size, *spatial_shape, channels], added and coalesced.

    Returns:
        a new SparseConvTensor built from the coalesced values and int32
        indices.
    """
    assert a.spatial_shape == b.spatial_shape
    assert a.batch_size == b.batch_size
    # compare a against b; comparing a with itself never fails.
    assert a.features.shape[1] == b.features.shape[1]
    res_shape = [a.batch_size, *a.spatial_shape, a.features.shape[1]]
    a_th = torch.sparse_coo_tensor(a.indices.T, a.features, res_shape, requires_grad=True)
    b_th = torch.sparse_coo_tensor(b.indices.T, b.features, res_shape, requires_grad=True)
    c_th = (a_th + b_th).coalesce()
    c_th_inds = c_th.indices().T.contiguous().int()
    c_th_values = c_th.values()
    assert c_th_values.is_contiguous()
    return SparseConvTensor(c_th_values, c_th_inds, a.spatial_shape, a.batch_size)
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import torch
from cumm import tensorview as tv
from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
from spconv.core_cc.csrc.hash.core import HashTable as _HashTable
_TORCH_DTYPE_TO_ITEMSIZE = {
torch.int32: 4,
torch.int64: 8,
torch.float32: 4,
torch.float64: 8,
}
class HashTable:
"""simple hash table for 32 and 64 bit data. support both cpu and cuda.
for cuda, it's a fixed-size table, you must provide maximum size
(recommend 2 * num).
see spconv/pytorch/functional/sparse_add_hash_based, a real example
that show how to use hash table to implement
sparse add (same shape, different indices)
"""
def __init__(self, device: torch.device, key_dtype: torch.dtype,
             value_dtype: torch.dtype,
             max_size: int = -1) -> None:
    """Create a CPU table, or a fixed-size CUDA table of ``max_size`` slots.

    Args:
        device: device of the table; ``device.type == "cpu"`` selects the
            CPU implementation.
        key_dtype: dtype of keys, must be in ``_TORCH_DTYPE_TO_ITEMSIZE``.
        value_dtype: dtype of values, must be in
            ``_TORCH_DTYPE_TO_ITEMSIZE``.
        max_size: number of slots of a non-CPU table; required (> 0) there
            and unused for the CPU table.
    """
    self.is_cpu = device.type == "cpu"
    self.key_dtype = key_dtype
    self.value_dtype = value_dtype
    if self.is_cpu:
        # the cpu table owns its storage on the C++ side.
        self.keys_data = None
        self.values_data = None
        key_data_tv = tv.Tensor()
        value_data_tv = tv.Tensor()
        stream = 0
    else:
        assert max_size > 0, "you must provide max_size for fixed-size cuda hash table, usually *2 of num of keys"
        assert device is not None, "you must specify device for cuda hash table."
        # the storage tensors are kept on self and passed to C++ as views.
        self.keys_data = torch.empty([max_size], dtype=key_dtype, device=device)
        self.values_data = torch.empty([max_size], dtype=value_dtype, device=device)
        key_data_tv = torch_tensor_to_tv(self.keys_data)
        value_data_tv = torch_tensor_to_tv(self.values_data)
        stream = get_current_stream()
    self.key_itemsize = _TORCH_DTYPE_TO_ITEMSIZE[self.key_dtype]
    self.value_itemsize = _TORCH_DTYPE_TO_ITEMSIZE[self.value_dtype]
    self._valid_value_dtype_for_arange = set([torch.int32, torch.int64])
    self._table = _HashTable(self.is_cpu, self.key_itemsize, self.value_itemsize,
                             key_data_tv, value_data_tv, stream)
def insert(self, keys: torch.Tensor, values: Optional[torch.Tensor] = None):
    """Insert ``keys`` (and optionally ``values``) into the table.

    When ``values`` is None only the keys are inserted and the stored
    values are undefined.
    """
    keys_tv = torch_tensor_to_tv(keys)
    values_tv = tv.Tensor() if values is None else torch_tensor_to_tv(values)
    # the cpu table ignores the stream; 0 is passed as a placeholder.
    stream = 0 if self.is_cpu else get_current_stream()
    return self._table.insert(keys_tv, values_tv, stream)
def query(self, keys: torch.Tensor, values: Optional[torch.Tensor] = None):
keys_tv = torch_tensor_to_tv(keys)
if values is None:
values = torch.empty([keys.shape[0]], dtype=self.value_dtype, device=keys.device)
values_tv = torch_tensor_to_tv(values)
stream = 0
if not self.is_cpu:
stream = get_current_stream()
is_empty = torch.empty([keys.shape[0]], dtype=torch.uint8, device=keys.device)
is_empty_tv = torch_tensor_to_tv(is_empty)
self._table.query(keys_tv, values_tv, is_empty_tv, stream)
return values, is_empty
def assign_arange_(self):
count_tv = tv.Tensor()
count = torch.Tensor()
stream = 0
if not self.is_cpu:
stream = get_current_stream()
else:
assert self.value_dtype in self._valid_value_dtype_for_arange
if not self.is_cpu:
assert self.values_data is not None
if self.key_itemsize == 4:
count = torch.zeros([1], dtype=torch.int32, device=self.values_data.device)
count_tv = torch_tensor_to_tv(count, dtype=tv.uint32)
elif self.key_itemsize == 8:
count = torch.zeros([1], dtype=torch.int64, device=self.values_data.device)
count_tv = torch_tensor_to_tv(count, dtype=tv.uint64)
else:
raise NotImplementedError
else:
max_size = self._table.size_cpu()
count = torch.tensor([max_size], dtype=torch.int64)
self._table.assign_arange_(count_tv, stream)
return count
def items(self, max_size: int = -1):
count_tv = tv.Tensor()
count = torch.Tensor()
stream = 0
if not self.is_cpu:
stream = get_current_stream()
if not self.is_cpu:
assert self.values_data is not None
if self.key_itemsize == 4:
count = torch.zeros([1], dtype=torch.int32, device=self.values_data.device)
count_tv = torch_tensor_to_tv(count, dtype=tv.uint32)
elif self.key_itemsize == 8:
count = torch.zeros([1], dtype=torch.int64, device=self.values_data.device)
count_tv = torch_tensor_to_tv(count, dtype=tv.uint64)
else:
raise NotImplementedError
if not self.is_cpu:
assert self.values_data is not None
if max_size == -1:
max_size = self.values_data.shape[0]
keys = torch.empty([max_size], dtype=self.key_dtype, device=self.values_data.device)
values = torch.empty([max_size], dtype=self.value_dtype, device=self.values_data.device)
else:
max_size = self._table.size_cpu()
count = torch.tensor([max_size], dtype=torch.int64)
keys = torch.empty([max_size], dtype=self.key_dtype)
values = torch.empty([max_size], dtype=self.value_dtype)
keys_tv = torch_tensor_to_tv(keys)
values_tv = torch_tensor_to_tv(values)
self._table.items(keys_tv, values_tv, count_tv, stream)
return keys, values, count
def main():
    """Smoke-test HashTable on the CPU and then on "cuda:0".

    For each device: insert eight key/value pairs, query them back in
    reverse order and print the result, then print the stored items before
    and after ``assign_arange_``.
    """
    capacity = 1000
    key_type = torch.int32
    value_type = torch.int64
    # (device name, extra constructor kwargs): only the CUDA table is
    # given an explicit max_size.
    setups = (("cpu", {}), ("cuda:0", {"max_size": capacity}))
    for device_name, extra_kwargs in setups:
        dev = torch.device(device_name)
        table = HashTable(dev, key_type, value_type, **extra_kwargs)
        keys = torch.tensor([5, 3, 7, 4, 6, 2, 10, 8], dtype=key_type, device=dev)
        values = torch.tensor([1, 6, 4, 77, 23, 756, 12, 12], dtype=value_type, device=dev)
        keys_query = torch.tensor([8, 10, 2, 6, 4, 7, 3, 5], dtype=key_type, device=dev)
        table.insert(keys, values)
        found, _ = table.query(keys_query)
        print(found)
        # First pass dumps the table as inserted, second pass after
        # assign_arange_ has run.
        for run_arange in (False, True):
            if run_arange:
                table.assign_arange_()
            ks, vs, cnt = table.items()
            num_items = cnt.item()
            print(cnt, ks[:num_items], vs[:num_items])
# Run the smoke test only when this module is executed as a script.
if __name__ == "__main__":
    main()
\ No newline at end of file
......@@ -22,6 +22,7 @@ import numpy as np
import spconv
from spconv.core import AlgoHint, ConvAlgo
from typing import List, Optional, Union
from spconv.pytorch.core import ThrustSortAllocator
from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
from spconv.core_cc.csrc.sparse.all import SpconvOps
import spconv.core_cc as _ext
......@@ -43,24 +44,6 @@ from spconv.tools import CUDAKernelTimer
# Module-level debug switch, off by default.
# NOTE(review): the code that reads this flag is outside this excerpt.
DEBUG = False
class ThrustSortAllocator:
    """Cache of uint8 scratch buffers on one device, handed out as raw pointers.

    Buffers are kept in ``alloced_objs`` keyed by the size they were created
    with and are never released while the allocator is alive.
    """

    def __init__(self, device: torch.device) -> None:
        super().__init__()
        self.alloced_objs = {}
        self.device = device

    def alloc(self, n: int):
        """Return the data pointer of a cached buffer holding at least ``n`` bytes.

        An exact-size buffer is preferred; otherwise the first cached
        buffer (in insertion order) that is strictly larger than ``n`` is
        reused. Only when neither exists is a new ``n``-byte buffer created
        and cached.
        """
        exact = self.alloced_objs.get(n)
        if exact is not None:
            return exact.data_ptr()
        larger = next(
            (buf for size, buf in self.alloced_objs.items() if n < size),
            None)
        if larger is not None:
            return larger.data_ptr()
        fresh = torch.empty([n], dtype=torch.uint8, device=self.device)
        self.alloced_objs[n] = fresh
        return fresh.data_ptr()
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
ndim = len(input_size)
output_size = []
......@@ -1482,3 +1465,4 @@ def indice_maxpool_implicit_gemm_backward(features, out_features, out_bp,
out_bp_tv, din_tv,
indice_pairs_tv, stream)
return din
......@@ -19,37 +19,68 @@ from torch.autograd import Function
from spconv.pytorch.modules import SparseModule
from spconv.pytorch.core import SparseConvTensor
from typing import List
from spconv.pytorch import functional as F
class JoinTable(SparseModule):
    """Concatenate the features of sparse tensors along the channel dimension."""

    def forward(self, input: List[SparseConvTensor]):
        """Join ``input`` into one tensor that reuses the first tensor's indices.

        Every tensor must have the same spatial shape, batch size and number
        of indices as ``input[0]``. Channel counts may differ, because the
        features are concatenated along dimension 1.

        Args:
            input: tensors to join.

        Returns:
            A new SparseConvTensor built from the concatenated features and
            the indices, spatial shape, batch size, grid, voxel_num and
            indice_dict of ``input[0]``.

        Raises:
            AssertionError: if a tensor's spatial shape, batch size or
                number of indices differs from ``input[0]``.
        """
        msg = "you can't use JoinTable in two sptensor with different indices."
        for ten in input:
            assert ten.spatial_shape == input[0].spatial_shape, msg
            assert ten.batch_size == input[0].batch_size, msg
            # No channel-count check: torch.cat along dim 1 only needs the
            # remaining dimensions to agree.
            assert ten.indices.shape[0] == input[0].indices.shape[0], msg
        output = SparseConvTensor(torch.cat([i.features for i in input], 1),
                                  input[0].indices, input[0].spatial_shape,
                                  input[0].batch_size, input[0].grid,
                                  input[0].voxel_num, input[0].indice_dict)
        # Bookkeeping objects come from the second tensor when there is one;
        # a single-element list falls back to its only tensor instead of
        # raising IndexError.
        meta_src = input[1] if len(input) > 1 else input[0]
        output.benchmark_record = meta_src.benchmark_record
        output.thrust_allocator = meta_src.thrust_allocator
        output._timer = meta_src._timer
        return output

    def input_spatial_size(self, out_size):
        """Return ``out_size`` unchanged."""
        return out_size
class AddTable(SparseModule):
    """Element-wise sum of the features of sparse tensors with identical layout."""

    def forward(self, input: List[SparseConvTensor]):
        """Add the features of ``input`` and reuse the first tensor's indices.

        Every tensor must have the same spatial shape, batch size, channel
        count and number of indices as ``input[0]``.

        Args:
            input: tensors to add.

        Returns:
            A new SparseConvTensor built from the summed features and the
            indices, spatial shape, batch size, grid, voxel_num and
            indice_dict of ``input[0]``.

        Raises:
            AssertionError: if a tensor's spatial shape, batch size, channel
                count or number of indices differs from ``input[0]``.
        """
        msg = "you can't use AddTable in two sptensor with different indices. use AddTableMisaligned instead."
        for ten in input:
            assert ten.spatial_shape == input[0].spatial_shape, msg
            assert ten.batch_size == input[0].batch_size, msg
            assert ten.features.shape[1] == input[0].features.shape[1], msg
            assert ten.indices.shape[0] == input[0].indices.shape[0], msg
        output = SparseConvTensor(sum([i.features for i in input]),
                                  input[0].indices, input[0].spatial_shape,
                                  input[0].batch_size, input[0].grid,
                                  input[0].voxel_num, input[0].indice_dict)
        # Bookkeeping objects come from the second tensor when there is one;
        # a single-element list falls back to its only tensor instead of
        # raising IndexError.
        meta_src = input[1] if len(input) > 1 else input[0]
        output.benchmark_record = meta_src.benchmark_record
        output.thrust_allocator = meta_src.thrust_allocator
        output._timer = meta_src._timer
        return output

    def input_spatial_size(self, out_size):
        """Return ``out_size`` unchanged."""
        return out_size
class AddTableMisaligned(SparseModule):
    """Add sparse tensors that have the same shape but different indices.

    Slower than AddTable.

    WARNING: do not use this in segmentation networks such as U-Net.
    Adding misaligned tensors clears the downsample indices, which stops
    SparseInverseConvXd from working.
    """

    def forward(self, input: List[SparseConvTensor]):
        """Pass the tensors of ``input`` as positional arguments to ``F.sparse_add_hash_based``."""
        summed = F.sparse_add_hash_based(*input)
        return summed

    def input_spatial_size(self, out_size):
        """Return ``out_size`` unchanged."""
        return out_size
class ConcatTable(SparseModule): # Module):
class ConcatTable(SparseModule):
def forward(self, input):
return [module(input) for module in self._modules.values()]
......
......@@ -140,7 +140,6 @@ class PointToVoxel(object):
num_voxels = res[0].shape[0]
else:
pc_tv = torch_tensor_to_tv(pc)
stream = get_current_stream()
voxels_tv = torch_tensor_to_tv(self.voxels)
indices_tv = torch_tensor_to_tv(self.indices)
num_per_voxel_tv = torch_tensor_to_tv(self.num_per_voxel)
......
2.1.13
2.1.14
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment