Commit b0f52b8a authored by yan.yan's avatar yan.yan
Browse files

v2.1.20:: add simple fp16 support for pascal

parent 370334aa
# Changelog # Changelog
## [2.1.20] - 2021-12-6
### Added
- Add fp16 conv SIMT kernels for mixed-precision training on Pascal or older GPUs. WARNING: not optimized for Tesla P100, which has 2x throughput in half precision.
## [2.1.19] - 2021-12-3 ## [2.1.19] - 2021-12-3
### Fixed ### Fixed
- Fix wrong arch assert in all kernels for old GPUs to make spconv work in sm_50 GPUs - Fix wrong arch assert in all kernels for old GPUs to make spconv work in sm_50 GPUs
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
## Short Guide ## Short Guide
* If you train without Tensor Core (i.e. FP32 training), set all ```algo``` in convolution/maxpool to ```ConvAlgo.Native``` manually. Default Algorithm is ```ConvAlgo.MaskImplicitGemm```, which is **SLOWER** than ```ConvAlgo.Native``` when use float32. this will be fixed in spconv 2.2. * If you train without Tensor Core (i.e. FP32 training or FP16 training for Pascal or older GPUS), set all ```algo``` in convolution/maxpool to ```ConvAlgo.Native``` manually. Default Algorithm is ```ConvAlgo.MaskImplicitGemm```, which is **SLOWER** than ```ConvAlgo.Native``` when use float32. this will be fixed in spconv 2.2.
* If your GPU support Tensor Core, use FP16 (mixed precision training) if possible. * If your GPU support Tensor Core, use FP16 (mixed precision training) if possible.
* If you train with mixed precision training (use Tensor Core), you don't need to set algorithm manually. * If you train with mixed precision training (use Tensor Core), you don't need to set algorithm manually.
* Currently fast algorithm only support kernel volume (prod of kernel size) <= 32, so don't use large kernel size. * Currently fast algorithm only support kernel volume (prod of kernel size) <= 32, so don't use large kernel size.
......
...@@ -85,18 +85,18 @@ SHUFFLE_SIMT_PARAMS: List[GemmAlgoParams] = [ ...@@ -85,18 +85,18 @@ SHUFFLE_SIMT_PARAMS: List[GemmAlgoParams] = [
"f32,f32,f32,f32,f32", 2, kernel.GemmAlgo.Simt, None), "f32,f32,f32,f32,f32", 2, kernel.GemmAlgo.Simt, None),
# fall back kernels if mat is misaligned for half # fall back kernels if mat is misaligned for half
# TODO use access-per-vector kernel instead of simt kernel for fallback # TODO use access-per-vector kernel instead of simt kernel for fallback
*gen_shuffle_params((128, 128, 8), (32, 64, 8), ["f16,f16,f16,f16,f16"], *gen_shuffle_params((128, 128, 8), (32, 64, 8), ["f16,f16,f16,f32,f32"],
"f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None), "f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
*gen_shuffle_params((32, 64, 32), (32, 32, 8), ["f16,f16,f16,f16,f16"], *gen_shuffle_params((32, 64, 32), (32, 32, 8), ["f16,f16,f16,f32,f32"],
"f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None), "f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
*gen_shuffle_params((32, 32, 32), (32, 32, 8), ["f16,f16,f16,f16,f16"], *gen_shuffle_params((32, 32, 32), (32, 32, 8), ["f16,f16,f16,f32,f32"],
"f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None), "f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
# *gen_shuffle_params( # *gen_shuffle_params(
# (64, 64, 16), # (64, 64, 16),
# (32, 32, 8), ["f16,f16,f16,f16,f16"], "f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None), # (32, 32, 8), ["f16,f16,f16,f16,f16"], "f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
*gen_shuffle_params((64, 128, 16), (32, 64, 8), ["f16,f16,f16,f16,f16"], *gen_shuffle_params((64, 128, 16), (32, 64, 8), ["f16,f16,f16,f32,f32"],
"f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None), "f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
*gen_shuffle_params((64, 64, 8), (32, 32, 8), ["f16,f16,f16,f16,f16"], *gen_shuffle_params((64, 64, 8), (32, 32, 8), ["f16,f16,f16,f32,f32"],
"f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None), "f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
] ]
...@@ -189,11 +189,13 @@ SHUFFLE_TURING_PARAMS: List[GemmAlgoParams] = [ ...@@ -189,11 +189,13 @@ SHUFFLE_TURING_PARAMS: List[GemmAlgoParams] = [
] ]
# SHUFFLE_TURING_PARAMS = [] # SHUFFLE_TURING_PARAMS = []
# Here we must use f32 accumulators for the SIMT f16 kernels because
# half intrinsics are VERY SLOW on the GTX 1000 series.
IMPLGEMM_SIMT_PARAMS = [ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 128, 16), (32, 32, 8), *gen_conv_params(ConvFwdAndBwdInput, (32, 128, 16), (32, 32, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -205,7 +207,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -205,7 +207,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 256, 8), (32, 64, 8), *gen_conv_params(ConvFwdAndBwdInput, (32, 256, 8), (32, 64, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -217,7 +219,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -217,7 +219,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 64, 16), (32, 32, 8), *gen_conv_params(ConvFwdAndBwdInput, (32, 64, 16), (32, 32, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -229,7 +231,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -229,7 +231,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (32, 32, 8), *gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (32, 32, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -241,7 +243,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -241,7 +243,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 256, 8), (32, 64, 8), *gen_conv_params(ConvFwdAndBwdInput, (64, 256, 8), (32, 64, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -253,7 +255,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -253,7 +255,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 8), (32, 64, 8), *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 8), (32, 64, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -265,7 +267,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -265,7 +267,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 64, 8), (32, 32, 8), *gen_conv_params(ConvFwdAndBwdInput, (64, 64, 8), (32, 32, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -277,7 +279,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -277,7 +279,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 32, 16), (32, 32, 8), *gen_conv_params(ConvFwdAndBwdInput, (64, 32, 16), (32, 32, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -289,7 +291,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -289,7 +291,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (32, 128, 16), (32, 32, 8), *gen_conv_params(ConvBwdWeight, (32, 128, 16), (32, 32, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -303,7 +305,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -303,7 +305,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (32, 64, 16), (32, 32, 8), *gen_conv_params(ConvBwdWeight, (32, 64, 16), (32, 32, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -315,7 +317,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -315,7 +317,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (32, 32, 32), (32, 32, 8), *gen_conv_params(ConvBwdWeight, (32, 32, 32), (32, 32, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -327,7 +329,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -327,7 +329,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (64, 256, 8), (32, 64, 8), *gen_conv_params(ConvBwdWeight, (64, 256, 8), (32, 64, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -339,7 +341,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -339,7 +341,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (64, 128, 8), (32, 64, 8), *gen_conv_params(ConvBwdWeight, (64, 128, 8), (32, 64, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -351,7 +353,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -351,7 +353,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (64, 64, 8), (32, 32, 8), *gen_conv_params(ConvBwdWeight, (64, 64, 8), (32, 32, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -363,7 +365,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -363,7 +365,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (64, 32, 16), (32, 32, 8), *gen_conv_params(ConvBwdWeight, (64, 32, 16), (32, 32, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -375,7 +377,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -375,7 +377,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (128, 128, 8), (32, 64, 8), *gen_conv_params(ConvBwdWeight, (128, 128, 8), (32, 64, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
...@@ -387,7 +389,7 @@ IMPLGEMM_SIMT_PARAMS = [ ...@@ -387,7 +389,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (128, 64, 8), (64, 32, 8), *gen_conv_params(ConvBwdWeight, (128, 64, 8), (64, 32, 8),
NDIM_DONT_CARE, NDIM_DONT_CARE,
ConvIterAlgo.Optimized, ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"], 2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC, NHWC,
NHWC, NHWC,
NHWC, NHWC,
......
2.1.19 2.1.20
\ No newline at end of file \ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment