Unverified Commit afa0341e authored by HandH1998's avatar HandH1998 Committed by GitHub
Browse files

Update Triton configs for block fp8 kernels (#2641)

parent 30828e71
{ {
"1": { "1": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 8,
"num_stages": 2 "num_stages": 3
}, },
"2": { "2": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -37,63 +37,63 @@ ...@@ -37,63 +37,63 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -101,7 +101,7 @@ ...@@ -101,7 +101,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -109,38 +109,38 @@ ...@@ -109,38 +109,38 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
} }
} }
{ {
"1": { "1": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"2": { "2": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -117,15 +117,15 @@ ...@@ -117,15 +117,15 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -133,7 +133,7 @@ ...@@ -133,7 +133,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -141,6 +141,6 @@ ...@@ -141,6 +141,6 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
} }
} }
{ {
"1": { "1": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"2": { "2": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -21,15 +21,15 @@ ...@@ -21,15 +21,15 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -37,31 +37,31 @@ ...@@ -37,31 +37,31 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -69,31 +69,31 @@ ...@@ -69,31 +69,31 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -101,7 +101,7 @@ ...@@ -101,7 +101,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -109,23 +109,23 @@ ...@@ -109,23 +109,23 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -133,14 +133,14 @@ ...@@ -133,14 +133,14 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
} }
} }
{ {
"1": { "1": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -13,63 +13,63 @@ ...@@ -13,63 +13,63 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -77,7 +77,7 @@ ...@@ -77,7 +77,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -85,7 +85,7 @@ ...@@ -85,7 +85,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -93,47 +93,47 @@ ...@@ -93,47 +93,47 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -141,6 +141,6 @@ ...@@ -141,6 +141,6 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
} }
} }
...@@ -3,17 +3,17 @@ ...@@ -3,17 +3,17 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -21,55 +21,55 @@ ...@@ -21,55 +21,55 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -77,7 +77,7 @@ ...@@ -77,7 +77,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -85,7 +85,7 @@ ...@@ -85,7 +85,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -93,15 +93,15 @@ ...@@ -93,15 +93,15 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -109,38 +109,38 @@ ...@@ -109,38 +109,38 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
} }
} }
{
"1": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"8": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"16": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"24": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 5
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
}
}
...@@ -3,17 +3,17 @@ ...@@ -3,17 +3,17 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -21,15 +21,15 @@ ...@@ -21,15 +21,15 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -37,39 +37,39 @@ ...@@ -37,39 +37,39 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -77,31 +77,31 @@ ...@@ -77,31 +77,31 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -109,38 +109,38 @@ ...@@ -109,38 +109,38 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
} }
} }
...@@ -3,33 +3,33 @@ ...@@ -3,33 +3,33 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -37,15 +37,15 @@ ...@@ -37,15 +37,15 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -53,7 +53,7 @@ ...@@ -53,7 +53,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -61,7 +61,7 @@ ...@@ -61,7 +61,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -69,7 +69,7 @@ ...@@ -69,7 +69,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -77,15 +77,15 @@ ...@@ -77,15 +77,15 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -93,15 +93,15 @@ ...@@ -93,15 +93,15 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -109,23 +109,23 @@ ...@@ -109,23 +109,23 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -133,14 +133,14 @@ ...@@ -133,14 +133,14 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
} }
} }
...@@ -3,129 +3,129 @@ ...@@ -3,129 +3,129 @@
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -133,14 +133,14 @@ ...@@ -133,14 +133,14 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
} }
} }
{ {
"1": { "1": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -21,23 +21,23 @@ ...@@ -21,23 +21,23 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -45,15 +45,15 @@ ...@@ -45,15 +45,15 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -61,15 +61,15 @@ ...@@ -61,15 +61,15 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -77,47 +77,47 @@ ...@@ -77,47 +77,47 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -125,7 +125,7 @@ ...@@ -125,7 +125,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -133,14 +133,14 @@ ...@@ -133,14 +133,14 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
} }
} }
...@@ -5,111 +5,111 @@ ...@@ -5,111 +5,111 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -117,7 +117,7 @@ ...@@ -117,7 +117,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -125,7 +125,7 @@ ...@@ -125,7 +125,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -133,14 +133,14 @@ ...@@ -133,14 +133,14 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
} }
} }
...@@ -5,142 +5,142 @@ ...@@ -5,142 +5,142 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 2 "num_stages": 3
} }
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment