Commit 2309ebe5 authored by yan.yan's avatar yan.yan
Browse files

v2.3.3: fix some problem in int8

parent b52636d1
# Changelog # Changelog
## [2.3.3] - 2023-02-02
### Fixed
- Fix int8 nvrtc error when using prebuilt
- Fix int8 kernel when running on Turing GPUs
## [2.3.2] - 2023-01-20 ## [2.3.2] - 2023-01-20
### Changed ### Changed
- change version - change version
......
...@@ -57,11 +57,9 @@ ...@@ -57,11 +57,9 @@
| CUDA 11.4 | [![PyPI Version][pypi-ver-114]][pypi-url-114] | ```pip install spconv-cu114```| [![pypi monthly download][pypi-download-114]][pypi-url-114]| | CUDA 11.4 | [![PyPI Version][pypi-ver-114]][pypi-url-114] | ```pip install spconv-cu114```| [![pypi monthly download][pypi-download-114]][pypi-url-114]|
| CUDA 11.6 | [![PyPI Version][pypi-ver-116]][pypi-url-116] | ```pip install spconv-cu116```| [![pypi monthly download][pypi-download-116]][pypi-url-116]| | CUDA 11.6 | [![PyPI Version][pypi-ver-116]][pypi-url-116] | ```pip install spconv-cu116```| [![pypi monthly download][pypi-download-116]][pypi-url-116]|
| CUDA 11.7 | [![PyPI Version][pypi-ver-117]][pypi-url-117] | ```pip install spconv-cu117```| [![pypi monthly download][pypi-download-117]][pypi-url-117]| | CUDA 11.7 | [![PyPI Version][pypi-ver-117]][pypi-url-117] | ```pip install spconv-cu117```| [![pypi monthly download][pypi-download-117]][pypi-url-117]|
| CUDA 11.8* | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]| | CUDA 11.8 | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]|
| CUDA 12.0 | [![PyPI Version][pypi-ver-120]][pypi-url-120] | ```pip install spconv-cu120```| [![pypi monthly download][pypi-download-120]][pypi-url-120]|
*: sm_89 and sm_90 were added in CUDA 11.8. If you use an RTX 4090 or H100, you should use this version.
<!-- | CUDA 12.0 | [![PyPI Version][pypi-ver-120]][pypi-url-120] | ```pip install spconv-cu120```| [![pypi monthly download][pypi-download-120]][pypi-url-120]| -->
```spconv``` is a project that provides a heavily-optimized sparse convolution implementation with tensor core support. Check [benchmark](docs/BENCHMARK.md) to see how fast spconv 2.x runs. ```spconv``` is a project that provides a heavily-optimized sparse convolution implementation with tensor core support. Check [benchmark](docs/BENCHMARK.md) to see how fast spconv 2.x runs.
......
[build-system] [build-system]
requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.4.5"] requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.4.5"]
# requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu120-0.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"] # requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu120-0.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"]
# requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu117-0.4.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
...@@ -167,8 +167,8 @@ if disable_jit is not None and disable_jit == "1": ...@@ -167,8 +167,8 @@ if disable_jit is not None and disable_jit == "1":
all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_AMPERE_PARAMS all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_AMPERE_PARAMS
all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
IMPLGEMM_TURING_PARAMS + IMPLGEMM_AMPERE_PARAMS) IMPLGEMM_TURING_PARAMS + IMPLGEMM_AMPERE_PARAMS)
all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle)) # all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp)) # all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
cu = GemmMainUnitTest(all_shuffle) cu = GemmMainUnitTest(all_shuffle)
convcu = ConvMainUnitTest(all_imp) convcu = ConvMainUnitTest(all_imp)
......
...@@ -840,7 +840,7 @@ IMPLGEMM_TURING_PARAMS = [ ...@@ -840,7 +840,7 @@ IMPLGEMM_TURING_PARAMS = [
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 16)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
...@@ -855,7 +855,7 @@ IMPLGEMM_TURING_PARAMS = [ ...@@ -855,7 +855,7 @@ IMPLGEMM_TURING_PARAMS = [
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 16)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=0, access_per_vector=0,
...@@ -1127,7 +1127,7 @@ if not SPCONV_INT8_DEBUG: ...@@ -1127,7 +1127,7 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 16)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
...@@ -1142,7 +1142,7 @@ if not SPCONV_INT8_DEBUG: ...@@ -1142,7 +1142,7 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 16)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
...@@ -1157,13 +1157,13 @@ if not SPCONV_INT8_DEBUG: ...@@ -1157,13 +1157,13 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 16)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
is_nvrtc=True, is_nvrtc=True,
int8_inference=True), int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (64, 64, 64), (32, 32, 64), *gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (16, 32, 32),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, 2,
...@@ -1172,14 +1172,13 @@ if not SPCONV_INT8_DEBUG: ...@@ -1172,14 +1172,13 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 32)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
is_nvrtc=True, is_nvrtc=True,
int8_inference=True), int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (16, 16, 32),
*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 64), (32, 64, 64),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, 2,
...@@ -1188,14 +1187,29 @@ if not SPCONV_INT8_DEBUG: ...@@ -1188,14 +1187,29 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 32)), TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (32, 16, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
["s8,s8,s8,s32,f32", "s8,s8,s8,s32,f16"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
is_nvrtc=True, is_nvrtc=True,
int8_inference=True), int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32), *gen_conv_params(ConvFwdAndBwdInput, (64, 64, 64), (32, 32, 64),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, 2,
...@@ -1204,14 +1218,14 @@ if not SPCONV_INT8_DEBUG: ...@@ -1204,14 +1218,14 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 16)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
is_nvrtc=True, is_nvrtc=True,
int8_inference=True), int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (128, 64, 64), (64, 32, 64), *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 64), (32, 64, 64),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, 2,
...@@ -1220,14 +1234,14 @@ if not SPCONV_INT8_DEBUG: ...@@ -1220,14 +1234,14 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 32)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
is_nvrtc=True, is_nvrtc=True,
int8_inference=True), int8_inference=True),
# TODO 16,8,32 produce wrong result.
*gen_conv_params(ConvFwdAndBwdInput, (128, 64, 32), (64, 32, 32), *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, 2,
...@@ -1236,14 +1250,14 @@ if not SPCONV_INT8_DEBUG: ...@@ -1236,14 +1250,14 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 16)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
is_nvrtc=True, is_nvrtc=True,
int8_inference=True), int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (128, 256, 64), (64, 128, 64), *gen_conv_params(ConvFwdAndBwdInput, (128, 64, 64), (64, 32, 64),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, 2,
...@@ -1252,14 +1266,30 @@ if not SPCONV_INT8_DEBUG: ...@@ -1252,14 +1266,30 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 32)), TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
# TODO 16,8,32 produce wrong result.
*gen_conv_params(ConvFwdAndBwdInput, (128, 64, 32), (64, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
["s8,s8,s8,s32,f32", "s8,s8,s8,s32,f16"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
is_nvrtc=True, is_nvrtc=True,
int8_inference=True), int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (256, 128, 64), (128, 64, 64), *gen_conv_params(ConvFwdAndBwdInput, (128, 256, 64), (64, 128, 64),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, 2,
...@@ -1268,14 +1298,14 @@ if not SPCONV_INT8_DEBUG: ...@@ -1268,14 +1298,14 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 32)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
is_nvrtc=True, is_nvrtc=True,
int8_inference=True), int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (128, 128, 128), (64, 64, 128), *gen_conv_params(ConvFwdAndBwdInput, (256, 128, 64), (128, 64, 64),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, 2,
...@@ -1284,7 +1314,7 @@ if not SPCONV_INT8_DEBUG: ...@@ -1284,7 +1314,7 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 32)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
...@@ -1300,7 +1330,7 @@ if not SPCONV_INT8_DEBUG: ...@@ -1300,7 +1330,7 @@ if not SPCONV_INT8_DEBUG:
NHWC, NHWC,
NHWC, NHWC,
GemmAlgo.Turing, GemmAlgo.Turing,
TensorOp((16, 8, 32)), TensorOp((8, 8, 16)),
mask_sparse=True, mask_sparse=True,
increment_k_first=True, increment_k_first=True,
access_per_vector=1, access_per_vector=1,
......
...@@ -330,10 +330,10 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -330,10 +330,10 @@ def _test_impgemm_conv_cuda(subm: bool):
device = torch.device("cuda:0") device = torch.device("cuda:0")
shapes = [[19, 18, 17]] shapes = [[19, 18, 17]]
batchsizes = [1] batchsizes = [1]
dtypes = [(np.float32, np.float32), (np.float16, np.float16)] # dtypes = [(np.float32, np.float32), (np.float16, np.float16)]
# dtypes = [np.float16] # dtypes = [np.float16]
# dtypes = [(np.int8, np.int8), (np.int8, np.float32), (np.int8, np.float16)] # dtypes = [(np.int8, np.int8), (np.int8, np.float32), (np.int8, np.float16)]
# dtypes = [(np.int8, np.int8)] dtypes = [(np.int8, np.int8)]
# dtypes = [(np.float16, np.float16)] # dtypes = [(np.float16, np.float16)]
test_case = TestCase() test_case = TestCase()
...@@ -341,6 +341,9 @@ def _test_impgemm_conv_cuda(subm: bool): ...@@ -341,6 +341,9 @@ def _test_impgemm_conv_cuda(subm: bool):
# out_channels = [32, 48, 64] # out_channels = [32, 48, 64]
in_channels = [32, 47] in_channels = [32, 47]
out_channels = [32, 48, 62] out_channels = [32, 48, 62]
in_channels = [16]
out_channels = [16]
# in_channels = [16] # in_channels = [16]
# out_channels = [16] # out_channels = [16]
......
...@@ -26,7 +26,7 @@ function repair_wheel { ...@@ -26,7 +26,7 @@ function repair_wheel {
} }
gcc -v gcc -v
export SPCONV_DISABLE_JIT="1" export SPCONV_DISABLE_JIT="1"
export CUMM_CUDA_ARCH_LIST="7.5" export CUMM_CUDA_ARCH_LIST="8.6"
# export SPCONV_PYTHON_LIST="3.7;3.8;3.9;3.10" # export SPCONV_PYTHON_LIST="3.7;3.8;3.9;3.10"
# Compile wheels, we only support 3.6-3.10. # Compile wheels, we only support 3.6-3.10.
# "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp # "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment