Unverified commit fdf04a14, authored by yigex, committed by GitHub
Browse files

[ROCm] Add ROCm tuning config to block gemm and Re-tune for AMD Radeon Graphics (#3418)


Co-authored-by: Bruce Xue <yigex@xilinx.com>
Co-authored-by: HAI <hixiao@gmail.com>
parent 5f0e7de3
...@@ -23,8 +23,13 @@ import torch ...@@ -23,8 +23,13 @@ import torch
import triton import triton
from tqdm import tqdm from tqdm import tqdm
from sglang.srt.layers.quantization.fp8_kernel import _w8a8_block_fp8_matmul from sglang.srt.layers.quantization.fp8_kernel import (
from sglang.srt.utils import get_device_name _w8a8_block_fp8_matmul,
_w8a8_block_fp8_matmul_unrolledx4,
)
from sglang.srt.utils import get_device_core_count, get_device_name, is_hip
is_hip_ = is_hip()
DTYPE_MAP = { DTYPE_MAP = {
"float32": torch.float32, "float32": torch.float32,
...@@ -80,7 +85,19 @@ def w8a8_block_fp8_matmul( ...@@ -80,7 +85,19 @@ def w8a8_block_fp8_matmul(
triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
) )
_w8a8_block_fp8_matmul[grid]( # Use manually unrolledx4 kernel on AMD GPU when the grid size is small.
# Empirical testing shows the sweet spot lies when it's less than the # of
# compute units available on the device.
num_workgroups = triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv(
N, config["BLOCK_SIZE_N"]
)
kernel = (
_w8a8_block_fp8_matmul_unrolledx4
if (is_hip_ == True and num_workgroups <= get_device_core_count())
else _w8a8_block_fp8_matmul
)
kernel[grid](
A, A,
B, B,
C, C,
...@@ -107,14 +124,15 @@ def w8a8_block_fp8_matmul( ...@@ -107,14 +124,15 @@ def w8a8_block_fp8_matmul(
return C return C
def get_configs_compute_bound(): def get_rocm_configs_compute_bound():
configs = [] configs = []
for num_stages in [2, 3, 4, 5]: waves_per_eu_range = 0
for block_m in [16, 32, 64, 128, 256]: for num_stages in [2]:
for block_k in [64, 128]: for block_m in [32, 64, 128, 256]:
for block_n in [32, 64, 128, 256]: for block_k in [32, 64, 128, 256]:
for block_n in [16, 32, 64, 128, 256]:
for num_warps in [4, 8]: for num_warps in [4, 8]:
for group_size in [1, 16, 32, 64]: for group_size in [1, 4, 8, 16, 32]:
configs.append( configs.append(
{ {
"BLOCK_SIZE_M": block_m, "BLOCK_SIZE_M": block_m,
...@@ -123,11 +141,36 @@ def get_configs_compute_bound(): ...@@ -123,11 +141,36 @@ def get_configs_compute_bound():
"GROUP_SIZE_M": group_size, "GROUP_SIZE_M": group_size,
"num_warps": num_warps, "num_warps": num_warps,
"num_stages": num_stages, "num_stages": num_stages,
"waves_per_eu": waves_per_eu_range,
} }
) )
return configs return configs
def get_configs_compute_bound():
configs = []
if is_hip_:
configs = get_rocm_configs_compute_bound()
else:
for num_stages in [2, 3, 4, 5]:
for block_m in [16, 32, 64, 128, 256]:
for block_k in [64, 128]:
for block_n in [32, 64, 128, 256]:
for num_warps in [4, 8]:
for group_size in [1, 16, 32, 64]:
configs.append(
{
"BLOCK_SIZE_M": block_m,
"BLOCK_SIZE_N": block_n,
"BLOCK_SIZE_K": block_k,
"GROUP_SIZE_M": group_size,
"num_warps": num_warps,
"num_stages": num_stages,
}
)
return configs
def get_weight_shapes(tp_size): def get_weight_shapes(tp_size):
# NOTE(HandH1998): The weight shapes only works for DeepSeek-V3. Modify them, if you tune for another different model. # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3. Modify them, if you tune for another different model.
# cannot TP # cannot TP
...@@ -190,14 +233,18 @@ def benchmark_config( ...@@ -190,14 +233,18 @@ def benchmark_config(
def tune(M, N, K, block_size, out_dtype, search_space): def tune(M, N, K, block_size, out_dtype, search_space):
factor_for_scale = 1e-2 factor_for_scale = 1e-2
fp8_info = torch.finfo(torch.float8_e4m3fn) fp8_info = torch.finfo(torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min fp8_max, fp8_min = fp8_info.max, fp8_info.min
A_fp32 = (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max A_fp32 = (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(
torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
)
B_fp32 = (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max B_fp32 = (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(
torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
)
block_n, block_k = block_size[0], block_size[1] block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n n_tiles = (N + block_n - 1) // block_n
......
...@@ -72,10 +72,10 @@ ...@@ -72,10 +72,10 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"64": { "64": {
"BLOCK_SIZE_M": 256, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
...@@ -54,10 +54,10 @@ ...@@ -54,10 +54,10 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -66,7 +66,7 @@ ...@@ -66,7 +66,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -75,7 +75,7 @@ ...@@ -75,7 +75,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -102,43 +102,43 @@ ...@@ -102,43 +102,43 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 128, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -147,16 +147,16 @@ ...@@ -147,16 +147,16 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -75,7 +75,7 @@ ...@@ -75,7 +75,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -93,23 +93,23 @@ ...@@ -93,23 +93,23 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"512": { "512": {
"BLOCK_SIZE_M": 128, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -118,27 +118,27 @@ ...@@ -118,27 +118,27 @@
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -147,7 +147,7 @@ ...@@ -147,7 +147,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -156,7 +156,7 @@ ...@@ -156,7 +156,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -66,43 +66,43 @@ ...@@ -66,43 +66,43 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -117,17 +117,17 @@ ...@@ -117,17 +117,17 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
...@@ -138,7 +138,7 @@ ...@@ -138,7 +138,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -66,7 +66,7 @@ ...@@ -66,7 +66,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -75,32 +75,32 @@ ...@@ -75,32 +75,32 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -109,7 +109,7 @@ ...@@ -109,7 +109,7 @@
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -120,16 +120,16 @@ ...@@ -120,16 +120,16 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -138,7 +138,7 @@ ...@@ -138,7 +138,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -147,7 +147,7 @@ ...@@ -147,7 +147,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -156,7 +156,7 @@ ...@@ -156,7 +156,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
...@@ -27,16 +27,16 @@ ...@@ -27,16 +27,16 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
...@@ -45,7 +45,7 @@ ...@@ -45,7 +45,7 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
...@@ -54,28 +54,28 @@ ...@@ -54,28 +54,28 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -84,7 +84,7 @@ ...@@ -84,7 +84,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -93,7 +93,7 @@ ...@@ -93,7 +93,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -117,8 +117,8 @@ ...@@ -117,8 +117,8 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
...@@ -126,8 +126,8 @@ ...@@ -126,8 +126,8 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
...@@ -136,27 +136,27 @@ ...@@ -136,27 +136,27 @@
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
...@@ -27,34 +27,34 @@ ...@@ -27,34 +27,34 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
...@@ -63,19 +63,19 @@ ...@@ -63,19 +63,19 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -84,7 +84,7 @@ ...@@ -84,7 +84,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -93,7 +93,7 @@ ...@@ -93,7 +93,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -102,7 +102,7 @@ ...@@ -102,7 +102,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -111,32 +111,32 @@ ...@@ -111,32 +111,32 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
...@@ -144,19 +144,19 @@ ...@@ -144,19 +144,19 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
...@@ -36,7 +36,7 @@ ...@@ -36,7 +36,7 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
...@@ -45,19 +45,19 @@ ...@@ -45,19 +45,19 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -66,14 +66,14 @@ ...@@ -66,14 +66,14 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -81,8 +81,8 @@ ...@@ -81,8 +81,8 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -90,17 +90,17 @@ ...@@ -90,17 +90,17 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -118,7 +118,7 @@ ...@@ -118,7 +118,7 @@
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -127,7 +127,7 @@ ...@@ -127,7 +127,7 @@
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -72,8 +72,8 @@ ...@@ -72,8 +72,8 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -81,8 +81,8 @@ ...@@ -81,8 +81,8 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -91,7 +91,7 @@ ...@@ -91,7 +91,7 @@
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -99,8 +99,8 @@ ...@@ -99,8 +99,8 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"256": { "256": {
"BLOCK_SIZE_M": 128, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -111,14 +111,14 @@ ...@@ -111,14 +111,14 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
...@@ -127,9 +127,9 @@ ...@@ -127,9 +127,9 @@
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -138,7 +138,7 @@ ...@@ -138,7 +138,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -147,7 +147,7 @@ ...@@ -147,7 +147,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -156,7 +156,7 @@ ...@@ -156,7 +156,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
...@@ -64,52 +64,52 @@ ...@@ -64,52 +64,52 @@
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"512": { "512": {
"BLOCK_SIZE_M": 128, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
...@@ -117,28 +117,28 @@ ...@@ -117,28 +117,28 @@
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 8,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
...@@ -156,7 +156,7 @@ ...@@ -156,7 +156,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 2, "num_stages": 2,
"waves_per_eu": 0 "waves_per_eu": 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment