Commit 2309ebe5 authored by yan.yan's avatar yan.yan
Browse files

v2.3.3: fix some problems in int8

parent b52636d1
# Changelog
## [2.3.3] - 2023-02-02
### Fixed
- Fix int8 nvrtc error when using prebuilt
- Fix int8 kernel when running on Turing GPUs
## [2.3.2] - 2023-01-20
### Changed
- change version
......
......@@ -57,11 +57,9 @@
| CUDA 11.4 | [![PyPI Version][pypi-ver-114]][pypi-url-114] | ```pip install spconv-cu114```| [![pypi monthly download][pypi-download-114]][pypi-url-114]|
| CUDA 11.6 | [![PyPI Version][pypi-ver-116]][pypi-url-116] | ```pip install spconv-cu116```| [![pypi monthly download][pypi-download-116]][pypi-url-116]|
| CUDA 11.7 | [![PyPI Version][pypi-ver-117]][pypi-url-117] | ```pip install spconv-cu117```| [![pypi monthly download][pypi-download-117]][pypi-url-117]|
| CUDA 11.8* | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]|
| CUDA 11.8 | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]|
| CUDA 12.0 | [![PyPI Version][pypi-ver-120]][pypi-url-120] | ```pip install spconv-cu120```| [![pypi monthly download][pypi-download-120]][pypi-url-120]|
*: sm_89 and sm_90 were added in CUDA 11.8. If you use an RTX 4090 or H100, you should use this version.
<!-- | CUDA 12.0 | [![PyPI Version][pypi-ver-120]][pypi-url-120] | ```pip install spconv-cu120```| [![pypi monthly download][pypi-download-120]][pypi-url-120]| -->
```spconv``` is a project that provides a heavily-optimized sparse convolution implementation with tensor core support. Check [benchmark](docs/BENCHMARK.md) to see how fast spconv 2.x runs.
......
[build-system]
requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.4.5"]
# requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu120-0.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"]
# requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu117-0.4.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"]
build-backend = "setuptools.build_meta"
......@@ -167,8 +167,8 @@ if disable_jit is not None and disable_jit == "1":
all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS + SHUFFLE_AMPERE_PARAMS
all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
IMPLGEMM_TURING_PARAMS + IMPLGEMM_AMPERE_PARAMS)
all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
# all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
# all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
cu = GemmMainUnitTest(all_shuffle)
convcu = ConvMainUnitTest(all_imp)
......
......@@ -840,7 +840,7 @@ IMPLGEMM_TURING_PARAMS = [
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
......@@ -855,7 +855,7 @@ IMPLGEMM_TURING_PARAMS = [
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=0,
......@@ -1127,7 +1127,7 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
......@@ -1142,7 +1142,7 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
......@@ -1157,13 +1157,13 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (64, 64, 64), (32, 32, 64),
*gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (16, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
......@@ -1172,14 +1172,13 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 64), (32, 64, 64),
*gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (16, 16, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
......@@ -1188,14 +1187,29 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (32, 16, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
["s8,s8,s8,s32,f32", "s8,s8,s8,s32,f16"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32),
*gen_conv_params(ConvFwdAndBwdInput, (64, 64, 64), (32, 32, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
......@@ -1204,14 +1218,14 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (128, 64, 64), (64, 32, 64),
*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 64), (32, 64, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
......@@ -1220,14 +1234,14 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
# TODO: 16,8,32 produces wrong results.
*gen_conv_params(ConvFwdAndBwdInput, (128, 64, 32), (64, 32, 32),
*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
......@@ -1236,14 +1250,14 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 16)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (128, 256, 64), (64, 128, 64),
*gen_conv_params(ConvFwdAndBwdInput, (128, 64, 64), (64, 32, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
......@@ -1252,14 +1266,30 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
# TODO: 16,8,32 produces wrong results.
*gen_conv_params(ConvFwdAndBwdInput, (128, 64, 32), (64, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
["s8,s8,s8,s32,f32", "s8,s8,s8,s32,f16"],
NHWC,
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (256, 128, 64), (128, 64, 64),
*gen_conv_params(ConvFwdAndBwdInput, (128, 256, 64), (64, 128, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
......@@ -1268,14 +1298,14 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
is_nvrtc=True,
int8_inference=True),
*gen_conv_params(ConvFwdAndBwdInput, (128, 128, 128), (64, 64, 128),
*gen_conv_params(ConvFwdAndBwdInput, (256, 128, 64), (128, 64, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2,
......@@ -1284,7 +1314,7 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
......@@ -1300,7 +1330,7 @@ if not SPCONV_INT8_DEBUG:
NHWC,
NHWC,
GemmAlgo.Turing,
TensorOp((16, 8, 32)),
TensorOp((8, 8, 16)),
mask_sparse=True,
increment_k_first=True,
access_per_vector=1,
......
......@@ -330,10 +330,10 @@ def _test_impgemm_conv_cuda(subm: bool):
device = torch.device("cuda:0")
shapes = [[19, 18, 17]]
batchsizes = [1]
dtypes = [(np.float32, np.float32), (np.float16, np.float16)]
# dtypes = [(np.float32, np.float32), (np.float16, np.float16)]
# dtypes = [np.float16]
# dtypes = [(np.int8, np.int8), (np.int8, np.float32), (np.int8, np.float16)]
# dtypes = [(np.int8, np.int8)]
dtypes = [(np.int8, np.int8)]
# dtypes = [(np.float16, np.float16)]
test_case = TestCase()
......@@ -341,6 +341,9 @@ def _test_impgemm_conv_cuda(subm: bool):
# out_channels = [32, 48, 64]
in_channels = [32, 47]
out_channels = [32, 48, 62]
in_channels = [16]
out_channels = [16]
# in_channels = [16]
# out_channels = [16]
......
......@@ -26,7 +26,7 @@ function repair_wheel {
}
gcc -v
export SPCONV_DISABLE_JIT="1"
export CUMM_CUDA_ARCH_LIST="7.5"
export CUMM_CUDA_ARCH_LIST="8.6"
# export SPCONV_PYTHON_LIST="3.7;3.8;3.9;3.10"
# Compile wheels, we only support 3.6-3.10.
# "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment