v2.1.3: fix a bug in cpu only

f31eee3a · yan.yan · abd3638c · f31eee3a · f31eee3a · f31eee3a
Commit f31eee3a authored Nov 08, 2021 by yan.yan
7 changed files
--- a/README.md
+++ b/README.md
@@ -14,9 +14,13 @@
 limitations under the License.
 -->

+[pypi-download]: https://img.shields.io/pypi/dm/spconv-cu114
+[pypi-url]: https://pypi.org/project/spconv-cu114/
+[pypi-image]: https://badge.fury.io/py/spconv-cu114.svg
+
 # SpConv: Spatially Sparse Convolution Library
+[![Build Status](https://github.com/traveller59/spconv/workflows/build/badge.svg)](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild) [![PyPI Version][pypi-image]][pypi-url] [![pypi monthly download][pypi-download]][pypi-url]

-[![Build Status](https://github.com/traveller59/spconv/workflows/build/badge.svg)](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild)

 ```spconv``` is a project that provide heavily-optimized sparse convolution implementation with tensor core support.

@@ -28,7 +32,7 @@ Spconv 1.x users **NEED READ [THIS](docs/SPCONV_2_BREAKING_CHANGEs.md)** before

 ## Spconv 2.1 vs Spconv 1.x

-* spconv now can be installed by **pip**. see install section in readme for more details.
+* spconv can now be installed with **pip**; see the install section of this README for more details. Users no longer need to build it manually!
 * Microsoft Windows support (only windows 10 has been tested).
 * fp32 (not tf32) training/inference speed is increased (+50~80%)
 * fp16 training/inference speed is greatly increased when your layer support tensor core (channel size must be multiple of 8).
@@ -87,6 +91,7 @@ CUDA 11.1 will be removed in spconv 2.2 because pytorch 1.10 don't provide prebu

 **NOTE** It's safe to have different **minor** cuda version between system and conda (pytorch) **in Linux**. for example, you can use spconv-cu114 with anaconda version of pytorch cuda 11.1 in a OS with CUDA 11.2 installed.

+**NOTE** On Linux, you can install spconv-cuxxx without installing CUDA on the system! Only a suitable NVIDIA driver is required. For CUDA 11, the driver version must be >= 450.82.

 ### Build from source for development (JIT, recommend)

@@ -147,7 +152,7 @@ You need to rebuild ```cumm``` first if you are build along a CUDA version that

 ## Note

-The work is done when the author is an employee at Tusimple.
+This work was done while the author was an employee at [TuSimple](https://www.tusimple.com/).

 ## LICENSE


--- a/docs/PERFORMANCE_GUIDE.md
+++ b/docs/PERFORMANCE_GUIDE.md
@@ -18,11 +18,12 @@

 ## Short Guide

-* If you train without Tensor Core (i.e. FP32 training), set all ```algo``` in convolution/maxpool to ```ConvAlgo.Native``` manually.
+* If you train without Tensor Core (i.e. FP32 training), set all ```algo``` in convolution/maxpool to ```ConvAlgo.Native``` manually. The default algorithm is ```ConvAlgo.MaskImplicitGemm```, which is **SLOWER** than ```ConvAlgo.Native``` when using float32. This will be fixed in spconv 2.2.
 * If your GPU support Tensor Core, use FP16 (mixed precision training) if possible. 
 * If you train with mixed precision training (use Tensor Core), you don't need to set algorithm manually.
 * Currently fast algorithm only support kernel volume (prod of kernel size) <= 32, so don't use large kernel size.
 * make sure your channel size is multiple of 8 when using fp16. multiple of 32 is better.
+* spconv 2.x on Windows 10 is 1.5x~2x slower than on Linux. Use Linux if possible.

 Network Benchmark without batchnorm (F32/F16) in RTX 3080 Laptop GPU


--- a/example/mnist_sparse.py
+++ b/example/mnist_sparse.py
@@ -65,7 +65,7 @@ class Net(nn.Module):
 def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    scaler = torch.cuda.amp.grad_scaler.GradScaler()
-    amp_ctx = identity_ctx()
+    amp_ctx = contextlib.nullcontext()
    if args.fp16:
        amp_ctx = torch.cuda.amp.autocast()
    for batch_idx, (data, target) in enumerate(train_loader):
@@ -105,7 +105,7 @@ def test(args, model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
-    amp_ctx = identity_ctx()
+    amp_ctx = contextlib.nullcontext()
    if args.fp16:
        amp_ctx = torch.cuda.amp.autocast()


--- a/spconv/cppconstants.py
+++ b/spconv/cppconstants.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import spconv.core_cc as _ext
+
+# CPU_ONLY_BUILD is True exactly when the compiled extension module does not
+# expose a ``cumm`` attribute, and False otherwise.
+# NOTE(review): presumably ``cumm`` is only present in CUDA builds -- confirm.
+CPU_ONLY_BUILD = not hasattr(_ext, "cumm")
--- a/spconv/pytorch/conv.py
+++ b/spconv/pytorch/conv.py
@@ -26,6 +26,7 @@ from spconv import pytorch as spconv
 from spconv.core import ConvAlgo
 import spconv.pytorch.functional as Fsp
 from spconv.pytorch import ops
+from spconv.cppconstants import CPU_ONLY_BUILD
 from spconv.pytorch.core import IndiceData, SparseConvTensor, ImplicitGemmIndiceData
 from spconv.pytorch.modules import SparseModule
 from spconv.constants import FILTER_HWIO
@@ -117,7 +118,7 @@ class SparseConvolution(SparseModule):
        self.subm = subm
        self.indice_key = indice_key
        if algo is None:
-            if kv <= 32:
+            if kv <= 32 and not CPU_ONLY_BUILD:
                if kv < 8:
                    algo = ConvAlgo.MaskImplicitGemm
                else:
@@ -126,7 +127,8 @@ class SparseConvolution(SparseModule):
                algo = ConvAlgo.Native
        if kv > 32:
            assert algo == ConvAlgo.Native, "implicit gemm don't support kv >= 32 for now"
-
+        if CPU_ONLY_BUILD:
+            assert algo == ConvAlgo.Native, "cpu only build only support native algorithm"
        self.algo = algo
        # self.algo = ConvAlgo.Native
        if self.algo == ConvAlgo.Native:

--- a/spconv/pytorch/pool.py
+++ b/spconv/pytorch/pool.py
@@ -28,6 +28,7 @@ import spconv.pytorch.functional as Fsp
 from spconv.pytorch import ops
 from spconv.pytorch.core import IndiceData, ImplicitGemmIndiceData
 from spconv.pytorch.modules import SparseModule
+from spconv.cppconstants import CPU_ONLY_BUILD


 class SparseMaxPool(SparseModule):
@@ -63,7 +64,7 @@ class SparseMaxPool(SparseModule):
        if algo is None:
            # keep in mind that this algorithm is set for Inverse Sparse Conv
            # maxpool itself don't need mask.
-            if kv <= 32:
+            if kv <= 32 and not CPU_ONLY_BUILD:
                if kv < 8:
                    algo = ConvAlgo.MaskImplicitGemm
                else:
@@ -72,6 +73,8 @@ class SparseMaxPool(SparseModule):
                algo = ConvAlgo.Native
        if kv > 32:
            assert algo == ConvAlgo.Native, "implicit gemm don't support kv >= 32 for now"
+        if CPU_ONLY_BUILD:
+            assert algo == ConvAlgo.Native, "cpu only build only support native algorithm"

        self.algo = algo


--- a/version.txt
+++ b/version.txt
-2.1.2
\ No newline at end of file
+2.1.3
\ No newline at end of file