Unverified Commit a781e84e authored by Michael Goin's avatar Michael Goin Committed by GitHub
Browse files

[Perf] Tune configs for triton block fp8 gemm H100/H200 (#23748)


Signed-off-by: default avatarmgoin <mgoin64@gmail.com>
parent 1b7b161a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_block_fp8_matmul,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton as vllm_triton
assert current_platform.is_cuda(), (
"Only support benchmarking w8a8 block fp8 kernel on CUDA device."
)
# DeepSeek-V3 weight shapes
DEEPSEEK_V3_SHAPES = [
(512 + 64, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(7168, 16384),
(7168, 18432),
(18432 * 2, 7168),
(24576, 1536),
(12288, 7168),
(4096, 7168),
(7168, 2048),
]
def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
"""Build runner function for w8a8 block fp8 matmul."""
factor_for_scale = 1e-2
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min
# Create random FP8 tensors
A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
# Create scales
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
Bs = (
torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
* factor_for_scale
)
def run():
return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
return run
@vllm_triton.testing.perf_report(
vllm_triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=["torch-bf16", "w8a8-block-fp8"],
line_names=["torch-bf16", "w8a8-block-fp8"],
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs W8A8 Block FP8 GEMMs",
args={},
)
)
def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
M = batch_size
device = "cuda"
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
a = torch.randn((M, K), device=device, dtype=torch.bfloat16)
b = torch.randn((N, K), device=device, dtype=torch.bfloat16)
ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
else: # w8a8-block-fp8
run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
lambda: run_w8a8(), quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
if __name__ == "__main__":
block_size = (128, 128)
for N, K in DEEPSEEK_V3_SHAPES:
print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}")
print(f"TFLOP/s comparison (block_size={block_size}):")
benchmark_tflops.run(
print_data=True,
# show_plots=False,
# save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}",
N=N,
K=K,
block_size=block_size,
)
print("\nBenchmark finished!")
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 5
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 5
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 5
},
"32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 5
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
}
}
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 5
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 5
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5
},
"24": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
}
}
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 3
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"24": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
}
}
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
}
}
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 4 "num_stages": 4
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 8, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 5
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -75,7 +75,7 @@ ...@@ -75,7 +75,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -83,7 +83,7 @@ ...@@ -83,7 +83,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -91,7 +91,7 @@ ...@@ -91,7 +91,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -107,7 +107,7 @@ ...@@ -107,7 +107,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -115,15 +115,15 @@ ...@@ -115,15 +115,15 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -133,13 +133,13 @@ ...@@ -133,13 +133,13 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
} }
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 4 "num_stages": 4
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 8, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 5
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -91,7 +91,7 @@ ...@@ -91,7 +91,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -99,9 +99,9 @@ ...@@ -99,9 +99,9 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -115,7 +115,7 @@ ...@@ -115,7 +115,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -139,8 +139,8 @@ ...@@ -139,8 +139,8 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
} }
} }
\ No newline at end of file
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 8, "num_warps": 8,
"num_stages": 4 "num_stages": 3
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 8, "num_warps": 8,
"num_stages": 4 "num_stages": 3
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 8, "num_warps": 8,
"num_stages": 3 "num_stages": 4
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
...@@ -32,19 +32,19 @@ ...@@ -32,19 +32,19 @@
"num_stages": 3 "num_stages": 3
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 2
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 8,
"num_stages": 3 "num_stages": 3
}, },
"32": { "32": {
...@@ -59,9 +59,9 @@ ...@@ -59,9 +59,9 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -75,7 +75,7 @@ ...@@ -75,7 +75,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -83,7 +83,7 @@ ...@@ -83,7 +83,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -91,7 +91,7 @@ ...@@ -91,7 +91,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -123,7 +123,7 @@ ...@@ -123,7 +123,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -139,7 +139,7 @@ ...@@ -139,7 +139,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
} }
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 8, "num_warps": 8,
"num_stages": 4 "num_stages": 4
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 4 "num_stages": 3
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 2
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 5
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 8,
"num_stages": 3 "num_stages": 3
}, },
"32": { "32": {
...@@ -59,15 +59,15 @@ ...@@ -59,15 +59,15 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 2
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -75,7 +75,7 @@ ...@@ -75,7 +75,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -91,7 +91,7 @@ ...@@ -91,7 +91,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -139,7 +139,7 @@ ...@@ -139,7 +139,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
} }
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 8,
"num_stages": 5 "num_stages": 3
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 3
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 3
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
...@@ -59,31 +59,31 @@ ...@@ -59,31 +59,31 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -99,7 +99,7 @@ ...@@ -99,7 +99,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -107,7 +107,7 @@ ...@@ -107,7 +107,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -123,7 +123,7 @@ ...@@ -123,7 +123,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -131,7 +131,7 @@ ...@@ -131,7 +131,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 8, "num_warps": 8,
"num_stages": 3 "num_stages": 3
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 3
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 3
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 3
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -59,33 +59,33 @@ ...@@ -59,33 +59,33 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -93,23 +93,23 @@ ...@@ -93,23 +93,23 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -133,7 +133,7 @@ ...@@ -133,7 +133,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
......
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 5
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 5
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 5
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
}
}
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 5
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 5
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
}
}
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
...@@ -8,55 +8,55 @@ ...@@ -8,55 +8,55 @@
"num_stages": 5 "num_stages": 5
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
...@@ -64,74 +64,74 @@ ...@@ -64,74 +64,74 @@
"num_stages": 4 "num_stages": 4
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -139,8 +139,8 @@ ...@@ -139,8 +139,8 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
} }
} }
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
...@@ -80,58 +80,58 @@ ...@@ -80,58 +80,58 @@
"num_stages": 5 "num_stages": 5
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 4
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -139,7 +139,7 @@ ...@@ -139,7 +139,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
} }
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 5 "num_stages": 5
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
...@@ -16,26 +16,26 @@ ...@@ -16,26 +16,26 @@
"num_stages": 5 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
...@@ -43,9 +43,9 @@ ...@@ -43,9 +43,9 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -59,7 +59,7 @@ ...@@ -59,7 +59,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
...@@ -67,31 +67,31 @@ ...@@ -67,31 +67,31 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 4
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -101,31 +101,31 @@ ...@@ -101,31 +101,31 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -133,7 +133,7 @@ ...@@ -133,7 +133,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -141,6 +141,6 @@ ...@@ -141,6 +141,6 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
} }
} }
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 8,
"num_stages": 5 "num_stages": 5
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
...@@ -24,18 +24,18 @@ ...@@ -24,18 +24,18 @@
"num_stages": 5 "num_stages": 5
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
...@@ -45,47 +45,47 @@ ...@@ -45,47 +45,47 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 4
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -93,29 +93,29 @@ ...@@ -93,29 +93,29 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -123,7 +123,7 @@ ...@@ -123,7 +123,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -133,7 +133,7 @@ ...@@ -133,7 +133,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 8,
"num_stages": 5 "num_stages": 5
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -45,7 +45,7 @@ ...@@ -45,7 +45,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -59,7 +59,7 @@ ...@@ -59,7 +59,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
...@@ -73,19 +73,19 @@ ...@@ -73,19 +73,19 @@
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 4
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 4
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -99,21 +99,21 @@ ...@@ -99,21 +99,21 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
...@@ -123,9 +123,9 @@ ...@@ -123,9 +123,9 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"3072": { "3072": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -133,7 +133,7 @@ ...@@ -133,7 +133,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
......
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 8,
"num_stages": 5 "num_stages": 5
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 5
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"48": { "48": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 5
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -73,25 +73,25 @@ ...@@ -73,25 +73,25 @@
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 4
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 4
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -99,31 +99,31 @@ ...@@ -99,31 +99,31 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -133,7 +133,7 @@ ...@@ -133,7 +133,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
...@@ -141,6 +141,6 @@ ...@@ -141,6 +141,6 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
} }
} }
{ {
"1": { "1": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 5 "num_stages": 3
}, },
"2": { "2": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 3
}, },
"4": { "4": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 3
}, },
"8": { "8": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 5 "num_stages": 3
}, },
"16": { "16": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
"24": { "24": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
"32": { "32": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
...@@ -59,39 +59,39 @@ ...@@ -59,39 +59,39 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 3
}, },
"64": { "64": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
"96": { "96": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 4 "num_stages": 4
}, },
"128": { "128": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"256": { "256": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 16,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -99,23 +99,23 @@ ...@@ -99,23 +99,23 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
"1024": { "1024": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
}, },
"1536": { "1536": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -123,7 +123,7 @@ ...@@ -123,7 +123,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -131,7 +131,7 @@ ...@@ -131,7 +131,7 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 64,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 3
}, },
...@@ -139,8 +139,8 @@ ...@@ -139,8 +139,8 @@
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64, "GROUP_SIZE_M": 32,
"num_warps": 4, "num_warps": 4,
"num_stages": 3 "num_stages": 4
} }
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment