Commit 8b5d2af0 authored by yan.yan's avatar yan.yan
Browse files

v2.1.13: better debug,add some spconv 1.x ops

parent fe4a2e61
...@@ -59,6 +59,7 @@ jobs: ...@@ -59,6 +59,7 @@ jobs:
env: env:
CUDA_VERSION: ${{ matrix.cuda-version }} CUDA_VERSION: ${{ matrix.cuda-version }}
PYTHON_VERSION: ${{ matrix.python-version }} PYTHON_VERSION: ${{ matrix.python-version }}
BOOST_VERSION: boost_1_77_0
if: | if: |
(env.CUDA_VERSION != '') && ( (env.CUDA_VERSION != '') && (
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) ||
...@@ -72,6 +73,11 @@ jobs: ...@@ -72,6 +73,11 @@ jobs:
$Env:CUMM_CUDA_ARCH_LIST = "all" $Env:CUMM_CUDA_ARCH_LIST = "all"
$Env:SPCONV_DISABLE_JIT = "1" $Env:SPCONV_DISABLE_JIT = "1"
pip install pccm pybind11 pip install pccm pybind11
# download boost header only
# BOOST_VERSION is supplied through the step's `env:` block, i.e. it is an
# environment variable. PowerShell only exposes those via the Env: drive;
# a bare $BOOST_VERSION is an (unset) script variable and expands to "".
$ProgressPreference = 'SilentlyContinue'
Invoke-WebRequest -Uri "https://boostorg.jfrog.io/artifactory/main/release/1.77.0/source/${Env:BOOST_VERSION}.zip" -UseBasicParsing -OutFile $HOME/boost.zip
Expand-Archive $HOME/boost.zip -DestinationPath $HOME/boost
$Env:BOOST_ROOT = "$HOME/boost/${Env:BOOST_VERSION}"
# ls "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${{ matrix.cuda-version }}\include\thrust" # ls "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${{ matrix.cuda-version }}\include\thrust"
python -m build --wheel --outdir dist/ . python -m build --wheel --outdir dist/ .
shell: powershell shell: powershell
...@@ -111,10 +117,15 @@ jobs: ...@@ -111,10 +117,15 @@ jobs:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install pep build - name: Install pep build
env:
BOOST_VERSION: boost_1_77_0
run: | run: |
python -m pip install build --user python -m pip install build --user
python -m pip install --upgrade pip twine wheel python -m pip install --upgrade pip twine wheel
python -m pip install pytest setuptools python -m pip install pytest setuptools
mkdir -p third_party
wget https://boostorg.jfrog.io/artifactory/main/release/1.77.0/source/$BOOST_VERSION.zip -O third_party/boost.zip
unzip third_party/boost.zip -d third_party/boost
- name: Build a cuda wheel - name: Build a cuda wheel
env: env:
...@@ -122,6 +133,7 @@ jobs: ...@@ -122,6 +133,7 @@ jobs:
PYTHON_VERSION: ${{ matrix.python-version }} PYTHON_VERSION: ${{ matrix.python-version }}
DOCKER_IMAGE: scrin/manylinux2014-cuda:cu${{ matrix.cuda-version }}-devel-1.0.0 DOCKER_IMAGE: scrin/manylinux2014-cuda:cu${{ matrix.cuda-version }}-devel-1.0.0
PLAT: manylinux2014_x86_64 PLAT: manylinux2014_x86_64
BOOST_VERSION: boost_1_77_0
if: | if: |
(env.CUDA_VERSION != '') && ( (env.CUDA_VERSION != '') && (
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) ||
...@@ -132,7 +144,10 @@ jobs: ...@@ -132,7 +144,10 @@ jobs:
) )
run: | run: |
chmod +x tools/build-wheels.sh chmod +x tools/build-wheels.sh
docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} -e SPCONV_PYTHON_LIST=${{env.PYTHON_VERSION}} -v `pwd`:/io $DOCKER_IMAGE bash -c "source /etc/bashrc && /io/tools/build-wheels.sh" docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} \
-e SPCONV_PYTHON_LIST=${{env.PYTHON_VERSION}} \
-e BOOST_ROOT=/io/third_party/boost/$BOOST_VERSION \
-v `pwd`:/io $DOCKER_IMAGE bash -c "source /etc/bashrc && /io/tools/build-wheels.sh"
- name: Build a cpu wheel - name: Build a cpu wheel
env: env:
...@@ -140,6 +155,7 @@ jobs: ...@@ -140,6 +155,7 @@ jobs:
PYTHON_VERSION: ${{ matrix.python-version }} PYTHON_VERSION: ${{ matrix.python-version }}
DOCKER_IMAGE: scrin/manylinux2014-cuda:cu114-devel-1.0.0 DOCKER_IMAGE: scrin/manylinux2014-cuda:cu114-devel-1.0.0
PLAT: manylinux2014_x86_64 PLAT: manylinux2014_x86_64
BOOST_VERSION: boost_1_77_0
if: | if: |
(env.CUDA_VERSION == '') && ( (env.CUDA_VERSION == '') && (
(github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) ) ||
...@@ -150,7 +166,10 @@ jobs: ...@@ -150,7 +166,10 @@ jobs:
) )
run: | run: |
chmod +x tools/build-wheels.sh chmod +x tools/build-wheels.sh
docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} -e SPCONV_PYTHON_LIST=${{env.PYTHON_VERSION}} -v `pwd`:/io $DOCKER_IMAGE bash -c "source /etc/bashrc && /io/tools/build-wheels.sh" docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} \
-e SPCONV_PYTHON_LIST=${{env.PYTHON_VERSION}} \
-e BOOST_ROOT=/io/third_party/boost/$BOOST_VERSION \
-v `pwd`:/io $DOCKER_IMAGE bash -c "source /etc/bashrc && /io/tools/build-wheels.sh"
- name: Publish a Python distribution to PyPI - name: Publish a Python distribution to PyPI
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
......
# Changelog # Changelog
## [2.1.13] - 2021-?-?
### Added
- Add some ops from spconv 1.x, see spconv.utils for more details.
- Add a debug tool that lets users attach more information to an issue.
## [2.1.12] - 2021-11-23 ## [2.1.12] - 2021-11-23
### Added ### Added
- Add a method for voxel generator to get pc_voxel_id, which is usually used in semantic segmentation - Add a method for voxel generator to get pc_voxel_id, which is usually used in semantic segmentation
......
...@@ -61,7 +61,7 @@ Spconv 1.x users **NEED READ [THIS](docs/SPCONV_2_BREAKING_CHANGEs.md)** before ...@@ -61,7 +61,7 @@ Spconv 1.x users **NEED READ [THIS](docs/SPCONV_2_BREAKING_CHANGEs.md)** before
* fp32 (not tf32) training/inference speed is increased (+50~80%) * fp32 (not tf32) training/inference speed is increased (+50~80%)
* fp16 training/inference speed is greatly increased when your layer support tensor core (channel size must be multiple of 8). * fp16 training/inference speed is greatly increased when your layer support tensor core (channel size must be multiple of 8).
* int8 op is ready, but we still need some time to figure out how to run int8 in pytorch. * int8 op is ready, but we still need some time to figure out how to run int8 in pytorch.
* [doesn't depend on pytorch binary](docs/FAQ.md#What-does-no-dependency-on-pytorch-mean), but you may need at least pytorch >= 1.6.0 to run spconv 2.x. * [doesn't depend on pytorch binary](docs/FAQ.md#What-does-no-dependency-on-pytorch-mean), but you may need at least pytorch >= 1.5.0 to run spconv 2.x.
* since spconv 2.x doesn't depend on pytorch binary (never in future), it's impossible to support torch.jit/libtorch inference. * since spconv 2.x doesn't depend on pytorch binary (never in future), it's impossible to support torch.jit/libtorch inference.
## Spconv 2.x Development and Roadmap ## Spconv 2.x Development and Roadmap
...@@ -108,18 +108,32 @@ CUDA 11.1 will be removed in spconv 2.2 because pytorch 1.10 don't provide prebu ...@@ -108,18 +108,32 @@ CUDA 11.1 will be removed in spconv 2.2 because pytorch 1.10 don't provide prebu
```pip install spconv-cu114``` for CUDA 11.4 ```pip install spconv-cu114``` for CUDA 11.4
**NOTE** It's safe to have different **minor** cuda version between system and conda (pytorch) **in Linux**. for example, you can use spconv-cu114 with anaconda version of pytorch cuda 11.1 in a OS with CUDA 11.2 installed. **NOTE** It's safe to have different **minor** cuda version between system and conda (pytorch) in **CUDA >= 11.0** because of [CUDA Minor Version Compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/#minor-version-compatibility). For example, you can use spconv-cu114 with anaconda version of pytorch cuda 11.1 in a OS with CUDA 11.2 installed.
For CUDA 10, we don't know whether ```spconv-cu102``` works with CUDA 10.0 and 10.1. Users are welcome to try it.
**NOTE** In Linux, you can install spconv-cuxxx without install CUDA to system! only suitable NVIDIA driver is required. for CUDA 11, we need driver >= 450.82. **NOTE** In Linux, you can install spconv-cuxxx without install CUDA to system! only suitable NVIDIA driver is required. for CUDA 11, we need driver >= 450.82.
#### Prebuilt GPU Support Matrix
See [this page](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) to check supported GPU names by arch.
| CUDA version | GPU Arch List |
| -------------- |:---------------------:|
| 10.2 | 50,52,60,61,70,75 |
| 11.x | 52,60,61,70,75,80,86 |
| 12.x | 60,61,70,75,80,86,90 |
### Build from source for development (JIT, recommend) ### Build from source for development (JIT, recommend)
The c++ code will be built automatically when you change c++ code in project. The c++ code will be built automatically when you change c++ code in project.
For NVIDIA Embedded Platforms, you need to specify cuda arch before build: ```export CUMM_CUDA_ARCH_LIST="7.2"``` for xavier. For NVIDIA Embedded Platforms, you need to specify cuda arch before build: ```export CUMM_CUDA_ARCH_LIST="7.2"``` for xavier, ```export CUMM_CUDA_ARCH_LIST="6.2"``` for TX2, ```export CUMM_CUDA_ARCH_LIST="8.7"``` for orin.
You need to remove ```cumm``` in ```requires``` section in pyproject.toml after install editable ```cumm``` and before install spconv due to pyproject limit (can't find editable installed ```cumm```). You need to remove ```cumm``` in ```requires``` section in pyproject.toml after install editable ```cumm``` and before install spconv due to pyproject limit (can't find editable installed ```cumm```).
You need to ensure that ```pip list | grep spconv``` and ```pip list | grep cumm``` show nothing before installing editable spconv/cumm.
#### Linux #### Linux
0. uninstall spconv and cumm installed by pip 0. uninstall spconv and cumm installed by pip
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
## Simple Benchmark ## Simple Benchmark
### Network Benchmark without batchnorm (F32/F16) in RTX 3080 Laptop GPU ### Network Benchmark without batchnorm (F32/F16) in RTX 3080 Laptop GPU 150W
Network Code: test/benchmark.py Network Code: test/benchmark.py
...@@ -25,6 +25,18 @@ Network Code: test/benchmark.py ...@@ -25,6 +25,18 @@ Network Code: test/benchmark.py
| Forward | 43ms | 21.7ms/13.7ms | 23.5ms/11.2ms | 22ms/12.2ms | | Forward | 43ms | 21.7ms/13.7ms | 23.5ms/11.2ms | 22ms/12.2ms |
| Backward | 80ms | 41.9ms/25.2ms | 51.0ms/13.8ms | 41.1ms/12.2ms | | Backward | 80ms | 41.9ms/25.2ms | 51.0ms/13.8ms | 41.1ms/12.2ms |
| F16 Forward | Native| Implicit Gemm | Implicit Gemm Split Mask |
| -------------- |:---------------------:|---------------------:| ---------------------:|
| RTX 3080 Laptop 150W | 13.7ms | 11.2ms | 12.2ms |
| RTX A6000 | 19.1ms | 11.7ms | 14.0ms |
| TESLA V100 | 17.9ms | 11.4ms | 13.4ms |
| F16 Backward | Native| Implicit Gemm | Implicit Gemm Split Mask |
| -------------- |:---------------------:|---------------------:| ---------------------:|
| RTX 3080 Laptop 150W | 25.2ms | 13.8ms | 12.2ms |
| RTX A6000 | 28.1ms | 9.2ms | 8.9ms |
| TESLA V100 | 33.9ms | 12.2ms | 12.9ms |
### Network Gemm Kernel Benchmark FP16 in RTX 3080 Laptop GPU ### Network Gemm Kernel Benchmark FP16 in RTX 3080 Laptop GPU
Network Code: test/benchmark.py Network Code: test/benchmark.py
......
...@@ -26,30 +26,3 @@ ...@@ -26,30 +26,3 @@
* spconv 2.x in Windows 10 is 1.5x~2x slower than Linux. use Linux if possible. * spconv 2.x in Windows 10 is 1.5x~2x slower than Linux. use Linux if possible.
See [benchmark](BENCHMARK.md) for more performance details of different algorithms. See [benchmark](BENCHMARK.md) for more performance details of different algorithms.
## Algorithm Overview
### Native Explicit (deprecated and removed in spconv 2.x)
native algorithm (explicit, no fused) is standard gather-gemm-scatter algorithm. Assume we compute 3x3 conv, We can split it to 9 of 1x1 conv which can be computed by matmul, then sum them to get final result.
For sparse convolution, we also do split-gemm-sum to calculate conv, but we need to collect data first because it's sparse.
### Native
Fused version of above algorithm. 1.5x-2x faster than non-fused version.
### Implicit Gemm
```Native``` algorithm do minimal mma (matrix multiply add), but it need to serialize IO. The pipeline of ```Native``` is gather-gemm-scatter-gather-gemm-scatter-...
```Implicit Gemm``` fuse all calculation to one kernel and perform overlapped gather-mma-scatter to save a lot of time.
![Image Overlapped Gemm](https://raw.githubusercontent.com/NVIDIA/cutlass/master/media/images/software-pipeline.png)
In my test, ```Implicit Gemm``` is almost 2x faster than ```Native```.
### Implicit Gemm Split Mask
TODO
In my test, ```Implicit Gemm Split Mask``` is slightly faster than ```Implicit Gemm```, but the indice generation is slower, so currently we use ```Implicit Gemm``` by default.
\ No newline at end of file
...@@ -156,6 +156,8 @@ if disable_jit is not None and disable_jit == "1": ...@@ -156,6 +156,8 @@ if disable_jit is not None and disable_jit == "1":
from cumm.conv.main import ConvMainUnitTest from cumm.conv.main import ConvMainUnitTest
from cumm.constants import CUMM_CPU_ONLY_BUILD from cumm.constants import CUMM_CPU_ONLY_BUILD
from spconv.csrc.sparse.all import SpconvOps from spconv.csrc.sparse.all import SpconvOps
from spconv.csrc.utils import BoxOps
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS) cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS) convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS)
convcu.namespace = "cumm.conv.main" convcu.namespace = "cumm.conv.main"
...@@ -168,9 +170,9 @@ if disable_jit is not None and disable_jit == "1": ...@@ -168,9 +170,9 @@ if disable_jit is not None and disable_jit == "1":
std = "c++14" std = "c++14"
else: else:
std = "c++17" std = "c++17"
cus = [cu, convcu, SpconvOps()] cus = [cu, convcu, SpconvOps(), BoxOps()]
if CUMM_CPU_ONLY_BUILD: if CUMM_CPU_ONLY_BUILD:
cus = [SpconvOps()] cus = [SpconvOps(), BoxOps()]
ext_modules: List[Extension] = [ ext_modules: List[Extension] = [
PCCMExtension(cus, PCCMExtension(cus,
"spconv/core_cc", "spconv/core_cc",
......
...@@ -28,6 +28,8 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable( ...@@ -28,6 +28,8 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from cumm.conv.main import ConvMainUnitTest from cumm.conv.main import ConvMainUnitTest
from spconv.csrc.sparse.all import SpconvOps from spconv.csrc.sparse.all import SpconvOps
from spconv.csrc.utils import BoxOps
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS +
SHUFFLE_TURING_PARAMS) SHUFFLE_TURING_PARAMS)
cu.namespace = "cumm.gemm.main" cu.namespace = "cumm.gemm.main"
...@@ -38,7 +40,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable( ...@@ -38,7 +40,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
if InWindows: if InWindows:
# windows have command line limit, so we use objects_folder to reduce command size. # windows have command line limit, so we use objects_folder to reduce command size.
objects_folder = "objects" objects_folder = "objects"
pccm.builder.build_pybind([cu, convcu, SpconvOps()], pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps()],
PACKAGE_ROOT / "core_cc", PACKAGE_ROOT / "core_cc",
namespace_root=PACKAGE_ROOT, namespace_root=PACKAGE_ROOT,
objects_folder=objects_folder, objects_folder=objects_folder,
......
...@@ -27,3 +27,15 @@ _filter_hwio_env = os.getenv("SPCONV_FILTER_HWIO", "0") ...@@ -27,3 +27,15 @@ _filter_hwio_env = os.getenv("SPCONV_FILTER_HWIO", "0")
FILTER_HWIO = _filter_hwio_env == "1" FILTER_HWIO = _filter_hwio_env == "1"
DISABLE_JIT = os.getenv("SPCONV_DISABLE_JIT", "0") == "1" DISABLE_JIT = os.getenv("SPCONV_DISABLE_JIT", "0") == "1"
NDIM_DONT_CARE = 3 NDIM_DONT_CARE = 3
# File that debug data is pickled to; an empty string means "not configured".
SPCONV_DEBUG_SAVE_PATH = os.getenv("SPCONV_DEBUG_SAVE_PATH", "")

# Optional root of a boost source tree, taken from the BOOST_ROOT environment
# variable. It stays None when the variable is unset; otherwise the directory
# (and its boost/geometry sub-directory) must exist.
_BOOST_ROOT = os.getenv("BOOST_ROOT")
BOOST_ROOT = None if _BOOST_ROOT is None else Path(_BOOST_ROOT)
if BOOST_ROOT is not None:
    assert BOOST_ROOT.exists(), "you provide BOOST_ROOT, but it not exists"
    _boost_geometry_dir = BOOST_ROOT / "boost" / "geometry"
    assert _boost_geometry_dir.exists(), "you provide BOOST_ROOT, but BOOST_ROOT/boost/geometry not exists"
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class BoxOps:
    # Type stub for the compiled box utilities (axis-aligned / rotated NMS and
    # rotated-box IoU). The rotated variants are only generated when the
    # extension was built with BOOST_ROOT set.
    @staticmethod
    def has_boost() -> bool: ...
    @staticmethod
    def non_max_suppression_cpu(boxes: Tensor, order: Tensor, thresh: float, eps: float = 0) -> List[int]:
        """
        Greedy non-maximum suppression on axis-aligned boxes (CPU).

        Args:
            boxes: 2-D float32/float64 tensor; columns 0..3 are read per box
                (presumably x1, y1, x2, y2 -- confirm with callers).
            order: 1-D integer tensor of box indices in the order in which
                they are visited.
            thresh: a later box is suppressed when its overlap ratio with a
                kept box is >= thresh.
            eps: offset added to every width/height before areas are computed.
        Returns:
            indices (taken from ``order``) of the boxes that were kept.
        """
        ...
    @staticmethod
    def rotate_non_max_suppression_cpu(box_corners: Tensor, order: Tensor, standup_iou: Tensor, thresh: float, eps: float = 0) -> List[int]:
        """
        Greedy non-maximum suppression on rotated boxes given as polygons (CPU).

        Args:
            box_corners: 3-D float32/float64 tensor indexed as
                (box, corner 0..3, coordinate 0..1).
            order: 1-D integer tensor of box indices in the order in which
                they are visited.
            standup_iou: 2-D tensor indexed by (box, box); pairs whose entry
                is <= 0 are skipped without a polygon test.
            thresh: a later box is suppressed when intersection / union of the
                two polygons is >= thresh.
            eps: accepted for signature compatibility; not read by the
                generated code.
        Returns:
            indices (taken from ``order``) of the boxes that were kept.
        """
        ...
    @staticmethod
    def rbbox_iou(box_corners: Tensor, qbox_corners: Tensor, standup_iou: Tensor, overlaps: Tensor, standup_thresh: float, inter_only: bool) -> None:
        """
        Pairwise overlap between two sets of rotated boxes, written in place.

        Args:
            box_corners: 3-D float32/float64 tensor (N, corner 0..3, coord 0..1).
            qbox_corners: 3-D tensor of the same dtype (K, corner 0..3, coord 0..1).
            standup_iou: 2-D tensor indexed (n, k); pairs whose entry is
                <= standup_thresh are skipped.
            overlaps: 2-D output tensor indexed (n, k). Only entries of pairs
                with a non-empty intersection are written.
            standup_thresh: pre-filter threshold applied to ``standup_iou``.
            inter_only: if True store the intersection area, otherwise
                intersection / union.
        """
        ...
    @staticmethod
    def rbbox_iou_aligned(box_corners: Tensor, qbox_corners: Tensor, overlaps: Tensor, inter_only: bool) -> None:
        """
        Element-wise overlap between the n-th box of each set, written in place.

        Args:
            box_corners: 3-D float32/float64 tensor (N, corner 0..3, coord 0..1).
            qbox_corners: 3-D tensor of the same dtype with the same number
                of boxes (a runtime error is raised otherwise).
            overlaps: 1-D output tensor indexed by n. Only entries of pairs
                with a non-empty intersection are written.
            inter_only: if True store the intersection area, otherwise
                intersection / union.
        """
        ...
...@@ -18,3 +18,7 @@ if hasattr(_ext, "cumm"): ...@@ -18,3 +18,7 @@ if hasattr(_ext, "cumm"):
CPU_ONLY_BUILD = False CPU_ONLY_BUILD = False
else: else:
CPU_ONLY_BUILD = True CPU_ONLY_BUILD = True
# Ask the compiled extension whether it was generated with BOOST_ROOT set,
# i.e. whether the boost.geometry based rotated-box ops are usable.
from spconv.core_cc.csrc.utils.boxops import BoxOps
HAS_BOOST = BoxOps.has_boost()
\ No newline at end of file
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .boxops import BoxOps
\ No newline at end of file
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pccm
from pathlib import Path
import os
from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib
from spconv.constants import BOOST_ROOT
class BoostGeometryLib(pccm.Class):
    # pccm dependency that makes <boost/geometry.hpp> available to generated
    # code. Only instantiable when BOOST_ROOT is configured.
    def __init__(self):
        super().__init__()
        # Users of this dependency check BOOST_ROOT before adding it, so a
        # None here indicates a programming error rather than bad user input.
        assert BOOST_ROOT is not None
        # Put the boost source tree on the include path, then pull in the header.
        self.build_meta.add_includes(BOOST_ROOT)
        self.add_include("boost/geometry.hpp")
class BoxOps(pccm.Class):
    # pccm code generator for CPU box utilities ported from spconv 1.x:
    # axis-aligned NMS, rotated NMS and rotated-box IoU. The rotated variants
    # need boost.geometry and are emitted as invalid (skipped) functions when
    # BOOST_ROOT is not configured.
    def __init__(self):
        super().__init__()
        self.add_dependency(TensorView)

    @pccm.pybind.mark
    @pccm.static_function
    def has_boost(self):
        # The answer is decided at code-generation time and baked into the
        # generated C++ as a literal.
        code = pccm.FunctionCode()
        code.raw(f"return {pccm.boolean(BOOST_ROOT is not None)};")
        return code.ret("bool")

    @pccm.pybind.mark(nogil=True)
    @pccm.static_function
    def non_max_suppression_cpu(self):
        # Greedy NMS on axis-aligned boxes.
        #   boxes:  2-D float/double tensor, columns 0..3 are read per box.
        #   order:  1-D integer tensor of box indices in visiting order.
        #   thresh: suppress a later box when its overlap ratio >= thresh.
        #   eps:    offset added to widths/heights.
        # Returns the indices of the kept boxes.
        code = pccm.FunctionCode()
        code.arg("boxes, order", "tv::Tensor")
        code.arg("thresh", "float")
        code.arg("eps", "float", "0")
        code.raw(f"""
        auto ndets = boxes.dim(0);
        // must start empty: kept indices are appended with push_back below.
        std::vector<int> keep;
        tv::dispatch<float, double>(boxes.dtype(), [&](auto I1){{
            using DType = TV_DECLTYPE(I1);
            auto boxes_r = boxes.tview<const DType, 2>();
            tv::dispatch<int, int64_t, uint32_t, uint64_t>(order.dtype(), [&](auto I2){{
                using T2 = TV_DECLTYPE(I2);
                auto order_r = order.tview<const T2, 1>();
                // sized up front because it is filled by index.
                std::vector<DType> areas(ndets);
                for (int i = 0; i < ndets; ++i){{
                    areas[i] = (boxes_r(i, 2) - boxes_r(i, 0) + eps) *
                               (boxes_r(i, 3) - boxes_r(i, 1) + eps);
                }}
                std::vector<int> suppressed(ndets, 0);
                int i, j;
                DType xx1, xx2, w, h, inter, ovr;
                for (int _i = 0; _i < ndets; ++_i) {{
                    i = order_r(_i);
                    if (suppressed[i] == 1)
                        continue;
                    keep.push_back(i);
                    for (int _j = _i + 1; _j < ndets; ++_j) {{
                        j = order_r(_j);
                        if (suppressed[j] == 1)
                            continue;
                        xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2));
                        xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0));
                        w = xx2 - xx1 + eps;
                        if (w > 0) {{
                            xx2 = std::min(boxes_r(i, 3), boxes_r(j, 3));
                            xx1 = std::max(boxes_r(i, 1), boxes_r(j, 1));
                            h = xx2 - xx1 + eps;
                            if (h > 0) {{
                                inter = w * h;
                                ovr = inter / (areas[i] + areas[j] - inter);
                                if (ovr >= thresh)
                                    suppressed[j] = 1;
                            }}
                        }}
                    }}
                }}
            }});
        }});
        return keep;
        """)
        return code.ret("std::vector<int>")

    @pccm.pybind.mark(nogil=True)
    @pccm.static_function
    def rotate_non_max_suppression_cpu(self):
        # Greedy NMS on rotated boxes given as 4-corner polygons.
        #   box_corners: 3-D float/double tensor (box, corner 0..3, coord 0..1).
        #   order:       1-D integer tensor of box indices in visiting order.
        #   standup_iou: 2-D tensor (box, box); pairs with value <= 0 are skipped.
        #   thresh:      suppress when polygon intersection / union >= thresh.
        #   eps:         kept in the signature but not read by the C++ body.
        # Returns the indices of the kept boxes.
        code = pccm.FunctionCode()
        code.arg("box_corners, order, standup_iou", "tv::Tensor")
        code.arg("thresh", "float")
        code.arg("eps", "float", "0")
        if BOOST_ROOT is None:
            return code.make_invalid()
        code.add_dependency(BoostGeometryLib)
        code.raw(f"""
        auto ndets = box_corners.dim(0);
        // must start empty: kept indices are appended with push_back below.
        std::vector<int> keep;
        tv::dispatch<float, double>(box_corners.dtype(), [&](auto I1){{
            using DType = TV_DECLTYPE(I1);
            auto box_corners_r = box_corners.tview<const DType, 3>();
            auto standup_iou_r = standup_iou.tview<const DType, 2>();
            tv::dispatch<int, int64_t, uint32_t, uint64_t>(order.dtype(), [&](auto I2){{
                using T2 = TV_DECLTYPE(I2);
                auto order_r = order.tview<const T2, 1>();
                std::vector<int> suppressed(ndets, 0);
                int i, j;

                namespace bg = boost::geometry;
                typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
                typedef bg::model::polygon<point_t> polygon_t;
                polygon_t poly, qpoly;
                std::vector<polygon_t> poly_inter, poly_union;
                DType inter_area, union_area, overlap;

                for (int _i = 0; _i < ndets; ++_i) {{
                    i = order_r(_i);
                    if (suppressed[i] == 1)
                        continue;
                    keep.push_back(i);
                    for (int _j = _i + 1; _j < ndets; ++_j) {{
                        j = order_r(_j);
                        if (suppressed[j] == 1)
                            continue;
                        if (standup_iou_r(i, j) <= 0.0)
                            continue;
                        // std::cout << "pre_poly" << std::endl;
                        // each ring is closed by repeating corner 0.
                        bg::append(poly,
                                   point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
                        bg::append(poly,
                                   point_t(box_corners_r(i, 1, 0), box_corners_r(i, 1, 1)));
                        bg::append(poly,
                                   point_t(box_corners_r(i, 2, 0), box_corners_r(i, 2, 1)));
                        bg::append(poly,
                                   point_t(box_corners_r(i, 3, 0), box_corners_r(i, 3, 1)));
                        bg::append(poly,
                                   point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
                        bg::append(qpoly,
                                   point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
                        bg::append(qpoly,
                                   point_t(box_corners_r(j, 1, 0), box_corners_r(j, 1, 1)));
                        bg::append(qpoly,
                                   point_t(box_corners_r(j, 2, 0), box_corners_r(j, 2, 1)));
                        bg::append(qpoly,
                                   point_t(box_corners_r(j, 3, 0), box_corners_r(j, 3, 1)));
                        bg::append(qpoly,
                                   point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
                        bg::intersection(poly, qpoly, poly_inter);

                        if (!poly_inter.empty()) {{
                            inter_area = bg::area(poly_inter.front());
                            bg::union_(poly, qpoly, poly_union);
                            if (!poly_union.empty()) {{ // ignore invalid box
                                union_area = bg::area(poly_union.front());
                                overlap = inter_area / union_area;
                                if (overlap >= thresh)
                                    suppressed[j] = 1;
                                poly_union.clear();
                            }}
                        }}
                        poly.clear();
                        qpoly.clear();
                        poly_inter.clear();
                    }}
                }}
            }});
        }});
        return keep;
        """)
        return code.ret("std::vector<int>")

    @pccm.pybind.mark(nogil=True)
    @pccm.static_function
    def rbbox_iou(self):
        # Pairwise overlap of N boxes against K query boxes, written in place.
        #   box_corners / qbox_corners: 3-D float/double tensors
        #       (box, corner 0..3, coord 0..1).
        #   standup_iou:    2-D tensor (n, k); pairs with value
        #                   <= standup_thresh are skipped.
        #   overlaps:       2-D output tensor (n, k); only pairs with a
        #                   non-empty intersection are written.
        #   inter_only:     store intersection area instead of IoU.
        code = pccm.FunctionCode()
        code.arg("box_corners, qbox_corners, standup_iou, overlaps", "tv::Tensor")
        code.arg("standup_thresh", "float")
        code.arg("inter_only", "bool")
        if BOOST_ROOT is None:
            return code.make_invalid()
        code.add_dependency(BoostGeometryLib)
        code.raw(f"""
        auto N = box_corners.dim(0);
        auto K = qbox_corners.dim(0);
        if (N == 0 || K == 0) {{
            return;
        }}
        tv::dispatch<float, double>(box_corners.dtype(), [&](auto I1){{
            using DType = TV_DECLTYPE(I1);
            auto box_corners_r = box_corners.tview<const DType, 3>();
            auto qbox_corners_r = qbox_corners.tview<const DType, 3>();
            auto standup_iou_r = standup_iou.tview<const DType, 2>();
            auto overlaps_rw = overlaps.tview<DType, 2>();

            namespace bg = boost::geometry;
            typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
            typedef bg::model::polygon<point_t> polygon_t;
            polygon_t poly, qpoly;
            std::vector<polygon_t> poly_inter, poly_union;
            DType inter_area, union_area;

            for (int k = 0; k < K; ++k) {{
                for (int n = 0; n < N; ++n) {{
                    if (standup_iou_r(n, k) <= standup_thresh)
                        continue;
                    // each ring is closed by repeating corner 0.
                    bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
                    bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
                    bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
                    bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
                    bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
                    bg::append(qpoly,
                               point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
                    bg::append(qpoly,
                               point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
                    bg::append(qpoly,
                               point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
                    bg::append(qpoly,
                               point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
                    bg::append(qpoly,
                               point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));

                    bg::intersection(poly, qpoly, poly_inter);
                    if (!poly_inter.empty()) {{
                        inter_area = bg::area(poly_inter.front());
                        if (inter_only){{
                            overlaps_rw(n, k) = inter_area;
                        }}else{{
                            bg::union_(poly, qpoly, poly_union);
                            if (!poly_union.empty()) {{
                                union_area = bg::area(poly_union.front());
                                overlaps_rw(n, k) = inter_area / union_area;
                            }}
                            poly_union.clear();
                        }}
                    }}
                    poly.clear();
                    qpoly.clear();
                    poly_inter.clear();
                }}
            }}
        }});
        return;
        """)
        return code

    @pccm.pybind.mark(nogil=True)
    @pccm.static_function
    def rbbox_iou_aligned(self):
        # Element-wise overlap of box n against query box n, written in place.
        #   box_corners / qbox_corners: 3-D float/double tensors with the same
        #       number of boxes (checked at runtime).
        #   overlaps:   1-D output tensor; only pairs with a non-empty
        #               intersection are written.
        #   inter_only: store intersection area instead of IoU.
        code = pccm.FunctionCode()
        code.arg("box_corners, qbox_corners, overlaps", "tv::Tensor")
        code.arg("inter_only", "bool")
        if BOOST_ROOT is None:
            return code.make_invalid()
        code.add_dependency(BoostGeometryLib)
        code.raw(f"""
        auto N = box_corners.dim(0);
        auto K = qbox_corners.dim(0);
        TV_ASSERT_RT_ERR(N == K, "aligned iou must have same number of box")
        if (N == 0 || K == 0) {{
            return;
        }}
        tv::dispatch<float, double>(box_corners.dtype(), [&](auto I1){{
            using DType = TV_DECLTYPE(I1);
            auto box_corners_r = box_corners.tview<const DType, 3>();
            auto qbox_corners_r = qbox_corners.tview<const DType, 3>();
            auto overlaps_rw = overlaps.tview<DType, 1>();

            namespace bg = boost::geometry;
            typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
            typedef bg::model::polygon<point_t> polygon_t;
            polygon_t poly, qpoly;
            std::vector<polygon_t> poly_inter, poly_union;
            DType inter_area, union_area;

            for (int n = 0; n < N; ++n) {{
                // each ring is closed by repeating corner 0.
                bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
                bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
                bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
                bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
                bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
                bg::append(qpoly,
                           point_t(qbox_corners_r(n, 0, 0), qbox_corners_r(n, 0, 1)));
                bg::append(qpoly,
                           point_t(qbox_corners_r(n, 1, 0), qbox_corners_r(n, 1, 1)));
                bg::append(qpoly,
                           point_t(qbox_corners_r(n, 2, 0), qbox_corners_r(n, 2, 1)));
                bg::append(qpoly,
                           point_t(qbox_corners_r(n, 3, 0), qbox_corners_r(n, 3, 1)));
                bg::append(qpoly,
                           point_t(qbox_corners_r(n, 0, 0), qbox_corners_r(n, 0, 1)));

                bg::intersection(poly, qpoly, poly_inter);
                if (!poly_inter.empty()) {{
                    inter_area = bg::area(poly_inter.front());
                    if (inter_only){{
                        overlaps_rw(n) = inter_area;
                    }}else{{
                        bg::union_(poly, qpoly, poly_union);
                        if (!poly_union.empty()) {{
                            union_area = bg::area(poly_union.front());
                            overlaps_rw(n) = inter_area / union_area;
                        }}
                        poly_union.clear();
                    }}
                }}
                poly.clear();
                qpoly.clear();
                poly_inter.clear();
            }}
        }});
        return;
        """)
        return code
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from pathlib import Path
from spconv.constants import SPCONV_DEBUG_SAVE_PATH
def spconv_save_debug_data(data):
    """Pickle *data* to the file named by ``SPCONV_DEBUG_SAVE_PATH``.

    This is a best-effort helper used while reporting an error: it never
    raises. When no path is configured it prints a hint on how to enable
    saving; when writing fails it prints the failure instead.

    Args:
        data: any picklable object that should be attached to a bug report.
    """
    if not SPCONV_DEBUG_SAVE_PATH:
        print("SPCONV_DEBUG_SAVE_PATH not found, "
              "you can specify SPCONV_DEBUG_SAVE_PATH as debug data save path "
              "to save debug data which can be attached in a issue.")
        return
    try:
        target = Path(SPCONV_DEBUG_SAVE_PATH)
        # Only the file itself is created here, never its directory.
        assert target.parent.exists(), "parent of SPCONV_DEBUG_SAVE_PATH must exist"
        with target.open("wb") as stream:
            pickle.dump(data, stream)
        saved_msg = (f"spconv save debug data to {SPCONV_DEBUG_SAVE_PATH}, "
                     "you can submit issue with log and this debug data attached.")
        print(saved_msg)
    except Exception as err:
        # Deliberately broad: saving debug data must not mask the original error.
        failed_msg = (f"spconv try to save debug data to {SPCONV_DEBUG_SAVE_PATH}, "
                      f"but failed with exception {err}. please check your SPCONV_DEBUG_SAVE_PATH")
        print(failed_msg)
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
import math import math
import time import time
import sys
from typing import List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
import numpy as np import numpy as np
...@@ -24,6 +25,7 @@ from torch.nn.parameter import Parameter ...@@ -24,6 +25,7 @@ from torch.nn.parameter import Parameter
from spconv import pytorch as spconv from spconv import pytorch as spconv
from spconv.core import ConvAlgo from spconv.core import ConvAlgo
from spconv.debug_utils import spconv_save_debug_data
from spconv.pytorch import functional as Fsp from spconv.pytorch import functional as Fsp
from spconv.pytorch import ops from spconv.pytorch import ops
from spconv.cppconstants import CPU_ONLY_BUILD from spconv.cppconstants import CPU_ONLY_BUILD
...@@ -291,11 +293,21 @@ class SparseConvolution(SparseModule): ...@@ -291,11 +293,21 @@ class SparseConvolution(SparseModule):
if input.benchmark: if input.benchmark:
torch.cuda.synchronize() torch.cuda.synchronize()
t = time.time() t = time.time()
outids, indice_pairs, indice_pair_num = ops.get_indice_pairs( try:
indices, batch_size, spatial_shape, algo, outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
self.kernel_size, self.stride, self.padding, indices, batch_size, spatial_shape, algo,
self.dilation, self.output_padding, self.subm, self.kernel_size, self.stride, self.padding,
self.transposed) self.dilation, self.output_padding, self.subm,
self.transposed)
except Exception as e:
msg = "[Exception|native_pair]"
msg += f"indices={indices.shape},bs={batch_size},ss={spatial_shape},"
msg += f"algo={algo},ksize={self.kernel_size},stride={self.stride},"
msg += f"padding={self.padding},dilation={self.dilation},subm={self.subm},"
msg += f"transpose={self.transposed}"
print(msg, file=sys.stderr)
spconv_save_debug_data(indices)
raise e
if input.benchmark: if input.benchmark:
torch.cuda.synchronize() torch.cuda.synchronize()
interval = time.time() - t interval = time.time() - t
...@@ -367,24 +379,36 @@ class SparseConvolution(SparseModule): ...@@ -367,24 +379,36 @@ class SparseConvolution(SparseModule):
mask_argsort_bwd_splits = datas.mask_argsort_bwd_splits mask_argsort_bwd_splits = datas.mask_argsort_bwd_splits
masks = datas.masks masks = datas.masks
else: else:
with input._timer.namespace("gen_pairs"): with input._timer.namespace("gen_pairs"):
# we need to gen bwd indices for regular conv # we need to gen bwd indices for regular conv
# because it may be inversed. # because it may be inversed.
res = ops.get_indice_pairs_implicit_gemm( try:
indices, res = ops.get_indice_pairs_implicit_gemm(
batch_size, indices,
spatial_shape, batch_size,
algo, spatial_shape,
ksize=self.kernel_size, algo,
stride=self.stride, ksize=self.kernel_size,
padding=self.padding, stride=self.stride,
dilation=self.dilation, padding=self.padding,
out_padding=self.output_padding, dilation=self.dilation,
subm=self.subm, out_padding=self.output_padding,
transpose=self.transposed, subm=self.subm,
is_train=(not self.subm) or self.training, transpose=self.transposed,
alloc=input.thrust_allocator, is_train=(not self.subm) or self.training,
timer=input._timer) alloc=input.thrust_allocator,
timer=input._timer)
except Exception as e:
msg = "[Exception|implicit_gemm_pair]"
msg += f"indices={indices.shape},bs={batch_size},ss={spatial_shape},"
msg += f"algo={algo},ksize={self.kernel_size},stride={self.stride},"
msg += f"padding={self.padding},dilation={self.dilation},subm={self.subm},"
msg += f"transpose={self.transposed}"
print(msg, file=sys.stderr)
spconv_save_debug_data(indices)
raise e
outids = res[0] outids = res[0]
num_inds_per_loc = res[1] num_inds_per_loc = res[1]
pair_fwd = res[2] pair_fwd = res[2]
......
...@@ -12,6 +12,9 @@ ...@@ -12,6 +12,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys
import pickle
import torch import torch
from torch import nn from torch import nn
from torch.autograd import Function from torch.autograd import Function
...@@ -19,9 +22,10 @@ from typing import Optional, TypeVar ...@@ -19,9 +22,10 @@ from typing import Optional, TypeVar
from spconv.tools import CUDAKernelTimer from spconv.tools import CUDAKernelTimer
from spconv.pytorch import ops from spconv.pytorch import ops
from spconv.pytorch.constants import PYTORCH_VERSION from spconv.pytorch.constants import PYTORCH_VERSION
from spconv.debug_utils import spconv_save_debug_data
from torch.autograd.function import once_differentiable from torch.autograd.function import once_differentiable
import numpy as np import numpy as np
from pathlib import Path
from typing import List from typing import List
...@@ -53,14 +57,22 @@ class SparseConvFunction(Function): ...@@ -53,14 +57,22 @@ class SparseConvFunction(Function):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters) ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
ctx.algo = algo ctx.algo = algo
ctx.timer = timer ctx.timer = timer
return ops.indice_conv(features, try:
filters, return ops.indice_conv(features,
indice_pairs, filters,
indice_pair_num, indice_pairs,
num_activate_out, indice_pair_num,
False, num_activate_out,
algo=algo, False,
timer=timer) algo=algo,
timer=timer)
except Exception as e:
msg = "[Exception|indice_conv]"
msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
msg += f"pairnum={indice_pair_num},act={num_activate_out},algo={algo}"
print(msg, file=sys.stderr)
spconv_save_debug_data((indice_pairs, indice_pair_num))
raise e
@staticmethod @staticmethod
@once_differentiable @once_differentiable
...@@ -68,15 +80,22 @@ class SparseConvFunction(Function): ...@@ -68,15 +80,22 @@ class SparseConvFunction(Function):
def backward(ctx, grad_output): def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
timer = ctx.timer timer = ctx.timer
try:
input_bp, filters_bp = ops.indice_conv_backward(features, input_bp, filters_bp = ops.indice_conv_backward(features,
filters, filters,
grad_output, grad_output,
indice_pairs, indice_pairs,
indice_pair_num, indice_pair_num,
False, False,
algo=ctx.algo, algo=ctx.algo,
timer=timer) timer=timer)
except Exception as e:
msg = "[Exception|indice_conv_backward]"
msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
msg += f"pairnum={indice_pair_num},do={grad_output.shape}"
print(msg, file=sys.stderr)
spconv_save_debug_data((indice_pairs, indice_pair_num))
raise e
return input_bp, filters_bp, None, None, None, None, None return input_bp, filters_bp, None, None, None, None, None
...@@ -95,16 +114,23 @@ class SparseInverseConvFunction(Function): ...@@ -95,16 +114,23 @@ class SparseInverseConvFunction(Function):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters) ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
ctx.algo = algo ctx.algo = algo
ctx.timer = timer ctx.timer = timer
try:
return ops.indice_conv(features, return ops.indice_conv(features,
filters, filters,
indice_pairs, indice_pairs,
indice_pair_num, indice_pair_num,
num_activate_out, num_activate_out,
True, True,
False, False,
algo=algo, algo=algo,
timer=timer) timer=timer)
except Exception as e:
msg = "[Exception|indice_conv|inverse]"
msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
msg += f"pairnum={indice_pair_num},act={num_activate_out},algo={algo}"
print(msg, file=sys.stderr)
spconv_save_debug_data((indice_pairs, indice_pair_num))
raise e
@staticmethod @staticmethod
@once_differentiable @once_differentiable
...@@ -112,16 +138,23 @@ class SparseInverseConvFunction(Function): ...@@ -112,16 +138,23 @@ class SparseInverseConvFunction(Function):
def backward(ctx, grad_output): def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
timer = ctx.timer timer = ctx.timer
try:
input_bp, filters_bp = ops.indice_conv_backward(features, input_bp, filters_bp = ops.indice_conv_backward(features,
filters, filters,
grad_output, grad_output,
indice_pairs, indice_pairs,
indice_pair_num, indice_pair_num,
True, True,
False, False,
algo=ctx.algo, algo=ctx.algo,
timer=timer) timer=timer)
except Exception as e:
msg = "[Exception|indice_conv_backward|inverse]"
msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
msg += f"pairnum={indice_pair_num},do={grad_output.shape}"
print(msg, file=sys.stderr)
spconv_save_debug_data((indice_pairs, indice_pair_num))
raise e
return input_bp, filters_bp, None, None, None, None, None return input_bp, filters_bp, None, None, None, None, None
...@@ -143,13 +176,23 @@ class SparseImplicitGemmFunction(Function): ...@@ -143,13 +176,23 @@ class SparseImplicitGemmFunction(Function):
is_train: bool, is_train: bool,
is_subm: bool, is_subm: bool,
timer: CUDAKernelTimer = CUDAKernelTimer(False)): timer: CUDAKernelTimer = CUDAKernelTimer(False)):
try:
out, mask_out, mask_width = ops.implicit_gemm(features, filters, out, mask_out, mask_width = ops.implicit_gemm(features, filters,
pair_fwd, pair_fwd,
pair_mask_fwd_splits, pair_mask_fwd_splits,
mask_argsort_fwd_splits, mask_argsort_fwd_splits,
num_activate_out, masks, num_activate_out, masks,
is_train, is_subm, timer) is_train, is_subm, timer)
except Exception as e:
msg = "[Exception|implicit_gemm]"
msg += f"feat={features.shape},w={filters.shape},pair={pair_fwd.shape},"
msg += f"act={num_activate_out},issubm={is_subm},istrain={is_train}"
print(msg, file=sys.stderr)
spconv_save_debug_data((pair_fwd, pair_bwd, pair_mask_fwd_splits,
pair_mask_bwd_splits, mask_argsort_fwd_splits, mask_argsort_bwd_splits,
masks))
raise e
ctx.save_for_backward(features, filters, pair_fwd, pair_bwd) ctx.save_for_backward(features, filters, pair_fwd, pair_bwd)
ctx.mask_width = mask_width ctx.mask_width = mask_width
ctx.mask_out = mask_out ctx.mask_out = mask_out
...@@ -178,21 +221,32 @@ class SparseImplicitGemmFunction(Function): ...@@ -178,21 +221,32 @@ class SparseImplicitGemmFunction(Function):
masks = ctx.masks masks = ctx.masks
is_subm = ctx.is_subm is_subm = ctx.is_subm
timer = ctx.timer timer = ctx.timer
input_bp, filters_bp = ops.implicit_gemm_backward( try:
features, input_bp, filters_bp = ops.implicit_gemm_backward(
filters, features,
grad_output, filters,
pair_fwd, grad_output,
pair_bwd, pair_fwd,
pair_mask_fwd_splits, pair_bwd,
pair_mask_bwd_splits, pair_mask_fwd_splits,
mask_argsort_fwd_splits, pair_mask_bwd_splits,
mask_argsort_bwd_splits, mask_argsort_fwd_splits,
mask_output_fwd=mask_out, mask_argsort_bwd_splits,
masks=masks, mask_output_fwd=mask_out,
mask_width=mask_width, masks=masks,
is_subm=is_subm, mask_width=mask_width,
timer=timer) is_subm=is_subm,
timer=timer)
except Exception as e:
msg = "[Exception|implicit_gemm_backward]"
msg += f"feat={features.shape},w={filters.shape},pair={pair_fwd.shape},"
msg += f"issubm={is_subm},do={grad_output.shape}"
print(msg, file=sys.stderr)
spconv_save_debug_data((pair_fwd, pair_bwd, pair_mask_fwd_splits,
pair_mask_bwd_splits, mask_argsort_fwd_splits, mask_argsort_bwd_splits,
masks))
raise e
None_9 = [None] * 11 None_9 = [None] * 11
return (input_bp, filters_bp, *None_9) return (input_bp, filters_bp, *None_9)
...@@ -211,15 +265,23 @@ class SubMConvFunction(Function): ...@@ -211,15 +265,23 @@ class SubMConvFunction(Function):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters) ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
ctx.algo = algo ctx.algo = algo
ctx.timer = timer ctx.timer = timer
return ops.indice_conv(features, try:
filters, return ops.indice_conv(features,
indice_pairs, filters,
indice_pair_num, indice_pairs,
num_activate_out, indice_pair_num,
False, num_activate_out,
True, False,
algo=algo, True,
timer=timer) algo=algo,
timer=timer)
except Exception as e:
msg = "[Exception|indice_conv|subm]"
msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
msg += f"pairnum={indice_pair_num},act={num_activate_out},algo={algo}"
print(msg, file=sys.stderr)
spconv_save_debug_data((indice_pairs, indice_pair_num))
raise e
@staticmethod @staticmethod
@once_differentiable @once_differentiable
...@@ -227,16 +289,24 @@ class SubMConvFunction(Function): ...@@ -227,16 +289,24 @@ class SubMConvFunction(Function):
def backward(ctx, grad_output): def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
timer = ctx.timer timer = ctx.timer
try:
input_bp, filters_bp = ops.indice_conv_backward(features,
filters,
grad_output,
indice_pairs,
indice_pair_num,
False,
True,
algo=ctx.algo,
timer=timer)
except Exception as e:
msg = "[Exception|indice_conv_backward|subm]"
msg += f"feat={features.shape},w={filters.shape},pair={indice_pairs.shape},"
msg += f"pairnum={indice_pair_num},do={grad_output.shape}"
print(msg, file=sys.stderr)
spconv_save_debug_data((indice_pairs, indice_pair_num))
raise e
input_bp, filters_bp = ops.indice_conv_backward(features,
filters,
grad_output,
indice_pairs,
indice_pair_num,
False,
True,
algo=ctx.algo,
timer=timer)
return input_bp, filters_bp, None, None, None, None, None return input_bp, filters_bp, None, None, None, None, None
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from typing import List from typing import List, Union
import torch import torch
from cumm import tensorview as tv from cumm import tensorview as tv
...@@ -158,12 +158,16 @@ class PointToVoxel(object): ...@@ -158,12 +158,16 @@ class PointToVoxel(object):
self.num_per_voxel[:num_voxels], pc_voxel_id) self.num_per_voxel[:num_voxels], pc_voxel_id)
def gather_features_by_pc_voxel_id(seg_res_features: torch.Tensor, pc_voxel_id: torch.Tensor): def gather_features_by_pc_voxel_id(seg_res_features: torch.Tensor, pc_voxel_id: torch.Tensor, invalid_value: Union[int, float] = 0):
"""This function is used to gather segmentation result to match origin pc. """This function is used to gather segmentation result to match origin pc.
""" """
if seg_res_features.device != pc_voxel_id.device: if seg_res_features.device != pc_voxel_id.device:
pc_voxel_id = pc_voxel_id.to(seg_res_features.device) pc_voxel_id = pc_voxel_id.to(seg_res_features.device)
res = torch.zeros((pc_voxel_id.shape[0], seg_res_features.shape[1]), dtype=seg_res_features.dtype, device=seg_res_features.device) res_feature_shape = (pc_voxel_id.shape[0], *seg_res_features.shape[1:])
if invalid_value == 0:
res = torch.zeros(res_feature_shape, dtype=seg_res_features.dtype, device=seg_res_features.device)
else:
res = torch.full(res_feature_shape, invalid_value, dtype=seg_res_features.dtype, device=seg_res_features.device)
pc_voxel_id_valid = pc_voxel_id != -1 pc_voxel_id_valid = pc_voxel_id != -1
pc_voxel_id_valid_ids = torch.nonzero(pc_voxel_id_valid).view(-1) pc_voxel_id_valid_ids = torch.nonzero(pc_voxel_id_valid).view(-1)
seg_res_features_valid = seg_res_features[pc_voxel_id[pc_voxel_id_valid_ids]] seg_res_features_valid = seg_res_features[pc_voxel_id[pc_voxel_id_valid_ids]]
......
...@@ -16,6 +16,7 @@ import numpy as np ...@@ -16,6 +16,7 @@ import numpy as np
from cumm import tensorview as tv from cumm import tensorview as tv
from contextlib import AbstractContextManager from contextlib import AbstractContextManager
from spconv.cppconstants import CPU_ONLY_BUILD from spconv.cppconstants import CPU_ONLY_BUILD
from spconv.core_cc.csrc.utils.boxops import BoxOps
from spconv.core_cc.csrc.sparse.all.ops_cpu1d import Point2VoxelCPU as Point2VoxelCPU1d from spconv.core_cc.csrc.sparse.all.ops_cpu1d import Point2VoxelCPU as Point2VoxelCPU1d
from spconv.core_cc.csrc.sparse.all.ops_cpu2d import Point2VoxelCPU as Point2VoxelCPU2d from spconv.core_cc.csrc.sparse.all.ops_cpu2d import Point2VoxelCPU as Point2VoxelCPU2d
...@@ -47,3 +48,69 @@ class nullcontext(AbstractContextManager): ...@@ -47,3 +48,69 @@ class nullcontext(AbstractContextManager):
def __exit__(self, *excinfo): def __exit__(self, *excinfo):
pass pass
def rbbox_iou(box_corners: np.ndarray, qbox_corners: np.ndarray,
              standup_iou: np.ndarray, standup_thresh: float):
    """Run ``BoxOps.rbbox_iou`` on two corner arrays and return the result.

    A zero-initialised ``(N, K)`` array is allocated, where ``N`` and ``K``
    are the leading dimensions of ``box_corners`` and ``qbox_corners``, using
    the dtype of ``box_corners``. It is passed to the native op as the output
    buffer together with ``standup_iou`` and ``standup_thresh``.

    The trailing flag passed to the native op is ``False`` here.
    NOTE(review): presumably this selects IoU rather than raw intersection;
    confirm against the BoxOps implementation.

    Raises:
        NotImplementedError: if ``BoxOps.has_boost()`` is false.
    """
    if BoxOps.has_boost():
        result_shape = (box_corners.shape[0], qbox_corners.shape[0])
        result = np.zeros(result_shape, dtype=box_corners.dtype)
        tv_boxes = tv.from_numpy(box_corners)
        tv_qboxes = tv.from_numpy(qbox_corners)
        tv_standup = tv.from_numpy(standup_iou)
        tv_result = tv.from_numpy(result)
        BoxOps.rbbox_iou(tv_boxes, tv_qboxes, tv_standup, tv_result,
                         standup_thresh, False)
        return result
    raise NotImplementedError(
        "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
    )
def rbbox_intersection(box_corners: np.ndarray, qbox_corners: np.ndarray,
                       standup_iou: np.ndarray, standup_thresh: float):
    """Run ``BoxOps.rbbox_iou`` with its trailing flag set to ``True``.

    The output buffer is a zero-initialised ``(N, K)`` array with the dtype
    of ``box_corners``; ``N`` and ``K`` are the leading dimensions of
    ``box_corners`` and ``qbox_corners``. The filled buffer is returned.

    NOTE(review): the ``True`` flag presumably makes the native op return the
    intersection instead of IoU; confirm against the BoxOps implementation.

    Raises:
        NotImplementedError: if ``BoxOps.has_boost()`` is false.
    """
    boost_available = BoxOps.has_boost()
    if not boost_available:
        raise NotImplementedError(
            "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
        )
    num_boxes = box_corners.shape[0]
    num_query_boxes = qbox_corners.shape[0]
    intersection = np.zeros(shape=(num_boxes, num_query_boxes),
                            dtype=box_corners.dtype)
    native_args = (
        tv.from_numpy(box_corners),
        tv.from_numpy(qbox_corners),
        tv.from_numpy(standup_iou),
        tv.from_numpy(intersection),
        standup_thresh,
        True,
    )
    BoxOps.rbbox_iou(*native_args)
    return intersection
def rbbox_iou_loss(box_corners: np.ndarray, qbox_corners: np.ndarray):
    """Run ``BoxOps.rbbox_iou_aligned`` and return its ``(N,)`` output.

    ``N`` is the leading dimension of ``box_corners``. The output buffer is
    zero-initialised with the dtype of ``box_corners`` and handed to the
    native op, whose trailing flag is ``False``.

    NOTE(review): "aligned" presumably means row ``i`` of ``box_corners`` is
    paired with row ``i`` of ``qbox_corners``; confirm against BoxOps.

    Raises:
        NotImplementedError: if ``BoxOps.has_boost()`` is false.
    """
    if BoxOps.has_boost():
        per_box = np.zeros((box_corners.shape[0], ), dtype=box_corners.dtype)
        tv_boxes = tv.from_numpy(box_corners)
        tv_qboxes = tv.from_numpy(qbox_corners)
        tv_out = tv.from_numpy(per_box)
        BoxOps.rbbox_iou_aligned(tv_boxes, tv_qboxes, tv_out, False)
        return per_box
    raise NotImplementedError(
        "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
    )
def non_max_suppression_cpu(boxes: np.ndarray,
                            order: np.ndarray,
                            thresh: float,
                            eps: float = 0.0):
    """Forward to ``BoxOps.non_max_suppression_cpu`` and return its result.

    ``boxes`` and ``order`` are wrapped with ``tv.from_numpy`` before the
    call; ``thresh`` and ``eps`` are passed through unchanged. Unlike the
    rotated-box helpers in this module, no boost check is performed here.
    """
    tv_boxes = tv.from_numpy(boxes)
    tv_order = tv.from_numpy(order)
    keep = BoxOps.non_max_suppression_cpu(tv_boxes, tv_order, thresh, eps)
    return keep
def rotate_non_max_suppression_cpu(boxes: np.ndarray, order: np.ndarray,
                                   standup_iou: np.ndarray, thresh: float):
    """Forward to ``BoxOps.rotate_non_max_suppression_cpu`` and return its result.

    ``boxes``, ``order`` and ``standup_iou`` are wrapped with
    ``tv.from_numpy`` before the call; ``thresh`` is passed through unchanged.

    Raises:
        NotImplementedError: if ``BoxOps.has_boost()`` is false.
    """
    if BoxOps.has_boost():
        tv_boxes = tv.from_numpy(boxes)
        tv_order = tv.from_numpy(order)
        tv_standup = tv.from_numpy(standup_iou)
        return BoxOps.rotate_non_max_suppression_cpu(tv_boxes, tv_order,
                                                     tv_standup, thresh)
    raise NotImplementedError(
        "this op require spconv built with boost, download boost, export BOOST_ROOT and rebuild."
    )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment