v2.3.3: fix some problems in int8

2309ebe5 · yan.yan · b52636d1 · 2309ebe5 · 2309ebe5 · 2309ebe5
Commit 2309ebe5 authored Feb 02, 2023 by yan.yan
8 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Changelog
+## [2.3.3] - 2023-02-02
+### Fixed 
+- Fix int8 NVRTC error when using prebuilt
+- Fix int8 kernel when running on Turing GPUs
 ## [2.3.2] - 2023-01-20
 ### Changed 
 - change version

--- a/README.md
+++ b/README.md
@@ -57,11 +57,9 @@
 | CUDA 11.4 | [![PyPI Version][pypi-ver-114]][pypi-url-114] | ```pip install spconv-cu114```| [![pypi monthly download][pypi-download-114]][pypi-url-114]|
 | CUDA 11.6 | [![PyPI Version][pypi-ver-116]][pypi-url-116] | ```pip install spconv-cu116```| [![pypi monthly download][pypi-download-116]][pypi-url-116]|
 | CUDA 11.7 | [![PyPI Version][pypi-ver-117]][pypi-url-117] | ```pip install spconv-cu117```| [![pypi monthly download][pypi-download-117]][pypi-url-117]| 
-| CUDA 11.8* | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]| 
+| CUDA 11.8 | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]| 
+| CUDA 12.0 | [![PyPI Version][pypi-ver-120]][pypi-url-120] | ```pip install spconv-cu120```| [![pypi monthly download][pypi-download-120]][pypi-url-120]| 
-*: sm_89 and sm_90 is added in CUDA 11.8. If you use RTX 4090 or H100, you should use this version.
-<!-- | CUDA 12.0 | [![PyPI Version][pypi-ver-120]][pypi-url-120] | ```pip install spconv-cu120```| [![pypi monthly download][pypi-download-120]][pypi-url-120]| -->
 ```spconv``` is a project that provides a heavily-optimized sparse convolution implementation with tensor core support. Check [benchmark](docs/BENCHMARK.md) to see how fast spconv 2.x runs.

--- a/pyproject.toml
+++ b/pyproject.toml
 [build-system]
 requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.4.5"]
 # requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu120-0.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"]
+# requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu117-0.4.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"]
 build-backend = "setuptools.build_meta"
--- a/setup.py
+++ b/setup.py
@@ -167,8 +167,8 @@ if disable_jit is not None and disable_jit == "1":
    all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_AMPERE_PARAMS
    all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
               IMPLGEMM_TURING_PARAMS + IMPLGEMM_AMPERE_PARAMS)
-    all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
+    # all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
-    all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
+    # all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
    cu = GemmMainUnitTest(all_shuffle)
    convcu = ConvMainUnitTest(all_imp)

--- a/spconv/core.py
+++ b/spconv/core.py
@@ -840,7 +840,7 @@ IMPLGEMM_TURING_PARAMS = [
                     NHWC,
                     NHWC,
                     GemmAlgo.Turing,
-                     TensorOp((16, 8, 16)),
+                     TensorOp((8, 8, 16)),
                     mask_sparse=True,
                     increment_k_first=True,
                     access_per_vector=1,
@@ -855,7 +855,7 @@ IMPLGEMM_TURING_PARAMS = [
                     NHWC,
                     NHWC,
                     GemmAlgo.Turing,
-                     TensorOp((16, 8, 16)),
+                     TensorOp((8, 8, 16)),
                     mask_sparse=True,
                     increment_k_first=True,
                     access_per_vector=0,
@@ -1127,7 +1127,7 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 16)),
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,
@@ -1142,7 +1142,7 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 16)),
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,
@@ -1157,13 +1157,13 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 16)),
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,
                        is_nvrtc=True,
                        int8_inference=True),
-        *gen_conv_params(ConvFwdAndBwdInput, (64, 64, 64), (32, 32, 64),
+        *gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (16, 32, 32),
                        NDIM_DONT_CARE,
                        ConvIterAlgo.Optimized,
                        2,
@@ -1172,14 +1172,13 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 32)),
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,
                        is_nvrtc=True,
                        int8_inference=True),
+        *gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (16, 16, 32),
-        *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 64), (32, 64, 64),
                        NDIM_DONT_CARE,
                        ConvIterAlgo.Optimized,
                        2,
@@ -1188,14 +1187,29 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 32)),
+                        TensorOp((8, 8, 16)),
+                        mask_sparse=True,
+                        increment_k_first=True,
+                        access_per_vector=1,
+                        is_nvrtc=True,
+                        int8_inference=True),
+        *gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (32, 16, 32),
+                        NDIM_DONT_CARE,
+                        ConvIterAlgo.Optimized,
+                        2,
+                        ["s8,s8,s8,s32,f32", "s8,s8,s8,s32,f16"],
+                        NHWC,
+                        NHWC,
+                        NHWC,
+                        GemmAlgo.Turing,
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,
                        is_nvrtc=True,
                        int8_inference=True),
-        *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32),
+        *gen_conv_params(ConvFwdAndBwdInput, (64, 64, 64), (32, 32, 64),
                        NDIM_DONT_CARE,
                        ConvIterAlgo.Optimized,
                        2,
@@ -1204,14 +1218,14 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 16)),
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,
                        is_nvrtc=True,
                        int8_inference=True),
-        *gen_conv_params(ConvFwdAndBwdInput, (128, 64, 64), (64, 32, 64),
+        *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 64), (32, 64, 64),
                        NDIM_DONT_CARE,
                        ConvIterAlgo.Optimized,
                        2,
@@ -1220,14 +1234,14 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 32)),
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,
                        is_nvrtc=True,
                        int8_inference=True),
-        # TODO 16,8,32 produce wrong result.
-        *gen_conv_params(ConvFwdAndBwdInput, (128, 64, 32), (64, 32, 32),
+        *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32),
                        NDIM_DONT_CARE,
                        ConvIterAlgo.Optimized,
                        2,
@@ -1236,14 +1250,14 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 16)),
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,
                        is_nvrtc=True,
                        int8_inference=True),
-        *gen_conv_params(ConvFwdAndBwdInput, (128, 256, 64), (64, 128, 64),
+        *gen_conv_params(ConvFwdAndBwdInput, (128, 64, 64), (64, 32, 64),
                        NDIM_DONT_CARE,
                        ConvIterAlgo.Optimized,
                        2,
@@ -1252,14 +1266,30 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 32)),
+                        TensorOp((8, 8, 16)),
+                        mask_sparse=True,
+                        increment_k_first=True,
+                        access_per_vector=1,
+                        is_nvrtc=True,
+                        int8_inference=True),
+        # TODO 16,8,32 produce wrong result.
+        *gen_conv_params(ConvFwdAndBwdInput, (128, 64, 32), (64, 32, 32),
+                        NDIM_DONT_CARE,
+                        ConvIterAlgo.Optimized,
+                        2,
+                        ["s8,s8,s8,s32,f32", "s8,s8,s8,s32,f16"],
+                        NHWC,
+                        NHWC,
+                        NHWC,
+                        GemmAlgo.Turing,
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,
                        is_nvrtc=True,
                        int8_inference=True),
-        *gen_conv_params(ConvFwdAndBwdInput, (256, 128, 64), (128, 64, 64),
+        *gen_conv_params(ConvFwdAndBwdInput, (128, 256, 64), (64, 128, 64),
                        NDIM_DONT_CARE,
                        ConvIterAlgo.Optimized,
                        2,
@@ -1268,14 +1298,14 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 32)),
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,
                        is_nvrtc=True,
                        int8_inference=True),
-        *gen_conv_params(ConvFwdAndBwdInput, (128, 128, 128), (64, 64, 128),
+        *gen_conv_params(ConvFwdAndBwdInput, (256, 128, 64), (128, 64, 64),
                        NDIM_DONT_CARE,
                        ConvIterAlgo.Optimized,
                        2,
@@ -1284,7 +1314,7 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 32)),
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,
@@ -1300,7 +1330,7 @@ if not SPCONV_INT8_DEBUG:
                        NHWC,
                        NHWC,
                        GemmAlgo.Turing,
-                        TensorOp((16, 8, 32)),
+                        TensorOp((8, 8, 16)),
                        mask_sparse=True,
                        increment_k_first=True,
                        access_per_vector=1,

--- a/test/test_all_algo.py
+++ b/test/test_all_algo.py
@@ -330,10 +330,10 @@ def _test_impgemm_conv_cuda(subm: bool):
    device = torch.device("cuda:0")
    shapes = [[19, 18, 17]]
    batchsizes = [1]
-    dtypes = [(np.float32, np.float32), (np.float16, np.float16)]
+    # dtypes = [(np.float32, np.float32), (np.float16, np.float16)]
    # dtypes = [np.float16]
    # dtypes = [(np.int8, np.int8), (np.int8, np.float32), (np.int8, np.float16)]
-    # dtypes = [(np.int8, np.int8)]
+    dtypes = [(np.int8, np.int8)]
    # dtypes = [(np.float16, np.float16)]
    test_case = TestCase()
@@ -341,6 +341,9 @@ def _test_impgemm_conv_cuda(subm: bool):
    # out_channels = [32, 48, 64]
    in_channels = [32, 47]
    out_channels = [32, 48, 62]
+    in_channels = [16]
+    out_channels = [16]
    # in_channels = [16]
    # out_channels = [16]

--- a/tools/build-wheels-dev.sh
+++ b/tools/build-wheels-dev.sh
@@ -26,7 +26,7 @@ function repair_wheel {
 }
 gcc -v
 export SPCONV_DISABLE_JIT="1"
-export CUMM_CUDA_ARCH_LIST="7.5"
+export CUMM_CUDA_ARCH_LIST="8.6"
 # export SPCONV_PYTHON_LIST="3.7;3.8;3.9;3.10"
 # Compile wheels, we only support 3.6-3.10.
 # "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp

--- a/version.txt
+++ b/version.txt
-2.3.2
+2.3.3