Commit b0f52b8a authored by yan.yan's avatar yan.yan
Browse files

v2.1.20:: add simple fp16 support for pascal

parent 370334aa
# Changelog
## [2.1.20] - 2021-12-6
### Added
- Add fp16 conv simt kernels for mixed-training in pascal or older GPUS. WARNING: not optimized for TESLA P100 which has 2x throughput in half.
## [2.1.19] - 2021-12-3
### Fixed
- Fix wrong arch assert in all kernels for old GPUs to make spconv work in sm_50 GPUs
......
......@@ -18,7 +18,7 @@
## Short Guide
* If you train without Tensor Core (i.e. FP32 training), set all ```algo``` in convolution/maxpool to ```ConvAlgo.Native``` manually. Default Algorithm is ```ConvAlgo.MaskImplicitGemm```, which is **SLOWER** than ```ConvAlgo.Native``` when use float32. this will be fixed in spconv 2.2.
* If you train without Tensor Core (i.e. FP32 training or FP16 training for Pascal or older GPUS), set all ```algo``` in convolution/maxpool to ```ConvAlgo.Native``` manually. Default Algorithm is ```ConvAlgo.MaskImplicitGemm```, which is **SLOWER** than ```ConvAlgo.Native``` when use float32. this will be fixed in spconv 2.2.
* If your GPU support Tensor Core, use FP16 (mixed precision training) if possible.
* If you train with mixed precision training (use Tensor Core), you don't need to set algorithm manually.
* Currently fast algorithm only support kernel volume (prod of kernel size) <= 32, so don't use large kernel size.
......
......@@ -85,18 +85,18 @@ SHUFFLE_SIMT_PARAMS: List[GemmAlgoParams] = [
"f32,f32,f32,f32,f32", 2, kernel.GemmAlgo.Simt, None),
# fall back kernels if mat is misaligned for half
# TODO use access-per-vector kernel instead of simt kernel for fallback
*gen_shuffle_params((128, 128, 8), (32, 64, 8), ["f16,f16,f16,f16,f16"],
*gen_shuffle_params((128, 128, 8), (32, 64, 8), ["f16,f16,f16,f32,f32"],
"f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
*gen_shuffle_params((32, 64, 32), (32, 32, 8), ["f16,f16,f16,f16,f16"],
*gen_shuffle_params((32, 64, 32), (32, 32, 8), ["f16,f16,f16,f32,f32"],
"f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
*gen_shuffle_params((32, 32, 32), (32, 32, 8), ["f16,f16,f16,f16,f16"],
*gen_shuffle_params((32, 32, 32), (32, 32, 8), ["f16,f16,f16,f32,f32"],
"f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
# *gen_shuffle_params(
# (64, 64, 16),
# (32, 32, 8), ["f16,f16,f16,f16,f16"], "f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
*gen_shuffle_params((64, 128, 16), (32, 64, 8), ["f16,f16,f16,f16,f16"],
*gen_shuffle_params((64, 128, 16), (32, 64, 8), ["f16,f16,f16,f32,f32"],
"f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
*gen_shuffle_params((64, 64, 8), (32, 32, 8), ["f16,f16,f16,f16,f16"],
*gen_shuffle_params((64, 64, 8), (32, 32, 8), ["f16,f16,f16,f32,f32"],
"f16,f16,f16,f32,f32", 2, kernel.GemmAlgo.Simt, None),
]
......@@ -189,11 +189,13 @@ SHUFFLE_TURING_PARAMS: List[GemmAlgoParams] = [
]
# SHUFFLE_TURING_PARAMS = []
# here we must use f32 for simt f16 accumulators because
# half intristics is VERY SLOW in GTX 1000 series.
IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 128, 16), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -205,7 +207,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 256, 8), (32, 64, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -217,7 +219,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 64, 16), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -229,7 +231,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -241,7 +243,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 256, 8), (32, 64, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -253,7 +255,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 8), (32, 64, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -265,7 +267,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 64, 8), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -277,7 +279,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 32, 16), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -289,7 +291,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (32, 128, 16), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -303,7 +305,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (32, 64, 16), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -315,7 +317,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (32, 32, 32), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -327,7 +329,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (64, 256, 8), (32, 64, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -339,7 +341,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (64, 128, 8), (32, 64, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -351,7 +353,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (64, 64, 8), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -363,7 +365,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (64, 32, 16), (32, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -375,7 +377,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (128, 128, 8), (32, 64, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -387,7 +389,7 @@ IMPLGEMM_SIMT_PARAMS = [
*gen_conv_params(ConvBwdWeight, (128, 64, 8), (64, 32, 8),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f32,f32,f32,f32,f32"],
2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......
2.1.19
\ No newline at end of file
2.1.20
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment