Commit bb596f6e authored by xiaowei.zhang's avatar xiaowei.zhang
Browse files

1. Update MOE; 2. Update sglang mHC; 3. Update test scripts; 4. Add new ops.
parent d9ebb683
import functools
from typing import Tuple
import tilelang
from tilelang import language as T
# TileLang pass configuration handed to `tilelang.jit(pass_configs=...)` for the
# stage-0 kernel below. Both switches are turned on; going by the key names they
# enable aggressive shared-memory merging and fast-math code generation.
_PASS_CONFIGS = {
tilelang.PassConfigKey.TL_ENABLE_AGGRESSIVE_SHARED_MEMORY_MERGE: True,
tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
}
# Build the two TileLang kernels of a split-K "GEMM + squared-sum" operation.
#
# Stage 0 cuts the hidden dimension into `split_k` slices. For every slice and
# every block of `token_block` tokens it accumulates
#   (a) x @ fn^T over that slice          -> out_partial[split, token, :]
#   (b) the sum of x*x over that slice    -> sqrsum_partial[split, token]
# Stage 1 then sums the partial results over the split axis into `out` and
# `sqrsum`.
#
# Args:
#   mhc_mult3: number of rows of `fn` / columns of `out`. Must be <= 32 because
#     the stage-0 tiles are hard-coded to 32 columns.
#   mhc_hidden_size: size of the hidden (reduction) dimension. Must be divisible
#     by both `hidden_block` and `split_k`, and each slice
#     (mhc_hidden_size // split_k) must itself be divisible by `hidden_block`.
#   split_k: number of slices the hidden dimension is split into.
#   token_block: number of tokens handled by one stage-0 thread block.
#   hidden_block: number of hidden elements consumed per stage-0 loop iteration.
#   threads: threads per block, used for both kernels.
#
# Returns:
#   The tuple (stage_0_kernel, stage_1_kernel).
#
# The builder is wrapped in functools.cache, so calling it again with the same
# arguments returns the previously built pair.
@functools.cache
def mhc_pre_gemm_sqrsum_splitk_kernel(
mhc_mult3: int,
mhc_hidden_size: int,
split_k: int,
token_block: int = 64,
hidden_block: int = 256,
threads: int = 256,
) -> Tuple[tilelang.JITKernel, tilelang.JITKernel]:
# Shape preconditions; see the Args section above.
assert mhc_mult3 <= 32
assert mhc_hidden_size % hidden_block == 0
assert mhc_hidden_size % split_k == 0
# Number of hidden elements owned by one split.
split_size = mhc_hidden_size // split_k
assert split_size % hidden_block == 0
# The token count is a symbolic (dynamic) dimension of the kernels.
num_tokens = T.dynamic("num_tokens")
# ---- Stage 0: per-split partial GEMM and partial sum of squares ----
@tilelang.jit(pass_configs=_PASS_CONFIGS)
def mhc_pre_gemm_sqrsum_splitk_stage_0(
x: T.Tensor[(num_tokens, mhc_hidden_size), T.bfloat16],
fn: T.Tensor[(mhc_mult3, mhc_hidden_size), T.float32],
out_partial: T.Tensor[(split_k, num_tokens, mhc_mult3), T.float32],
sqrsum_partial: T.Tensor[(split_k, num_tokens), T.float32],
):
# Launch grid: first axis indexes the split (bz), second axis indexes the
# block of `token_block` tokens (px).
with T.Kernel(split_k, T.ceildiv(num_tokens, token_block), threads=threads) as (
bz,
px,
):
# Accumulators that live across the whole hidden-slice loop:
#   out_frag  - GEMM result tile, always 32 columns wide
#   sq_part4  - 16 partial sums of squares per token; column j collects the
#               squares of the hidden elements at offsets j, j+16, j+32, ...
out_frag = T.alloc_fragment((token_block, 32), T.float32)
sq_part4 = T.alloc_fragment((token_block, 16), T.float32)
T.clear(out_frag)
T.clear(sq_part4)
# First hidden index of the slice owned by this split.
k_base = bz * split_size
# Walk the slice in chunks of `hidden_block`.
# num_stages=0 presumably disables software pipelining - TODO confirm.
for pz in T.Pipelined(split_size // hidden_block, num_stages=0):
x_frag_pre = T.alloc_fragment((token_block, hidden_block), T.bfloat16)
fn_frag_pre = T.alloc_fragment((32, hidden_block), T.float32)
x_frag_16 = T.alloc_fragment((token_block, hidden_block), T.bfloat16)
x_frag = T.alloc_fragment((token_block, hidden_block), T.float32)
fn_frag = T.alloc_fragment((32, hidden_block), T.float32)
x_smem_16 = T.alloc_shared((token_block, hidden_block), T.bfloat16)
fn_smem = T.alloc_shared((32, hidden_block), T.float32)
# Both shared staging buffers use the HCU swizzled layout.
T.annotate_layout({x_smem_16: tilelang.layout.make_hcu_swizzled_layout(x_smem_16, major_pack=2)})
T.annotate_layout({fn_smem: tilelang.layout.make_hcu_swizzled_layout(fn_smem, major_pack=2)})
# Load the current x tile and fn tile from the input tensors.
# NOTE(review): the fn tile is 32 rows tall while `fn` has only mhc_mult3
# rows, and the last token block may run past num_tokens; presumably T.copy
# guards the out-of-range elements - confirm.
T.copy(x[px * token_block, k_base + pz * hidden_block], x_frag_pre)
T.copy(fn[0, k_base + pz * hidden_block], fn_frag_pre)
# Round-trip x through the shared buffer, then widen bfloat16 -> float32.
T.copy(x_frag_pre, x_smem_16)
T.copy(x_smem_16, x_frag_16)
T.copy(x_frag_16, x_frag)
# Round-trip fn through the shared buffer as well.
T.copy(fn_frag_pre, fn_smem)
T.copy(fn_smem, fn_frag)
# Accumulate x*x into the 16 partial columns, 16 hidden elements at a time.
for jj in T.serial(hidden_block // 16):
for i, j in T.Parallel(token_block, 16):
v = x_frag[i, jj * 16 + j]
sq_part4[i, j] += v * v
# out_frag += x_frag @ fn_frag^T (B is transposed), accumulated across pz.
T.gemm(
x_frag,
fn_frag,
out_frag,
transpose_A=False,
transpose_B=True,
k_pack=2,
policy=T.GemmWarpPolicy.FullRow,
use_tf32=True,
)
# Collapse the 16 partial columns into a single sum of squares per token.
sq_l = T.alloc_fragment((token_block,), T.float32)
T.reduce_sum(sq_part4, sq_l)
# Stage the GEMM tile through shared memory before the guarded store.
out_shared = T.alloc_shared((token_block, 32), T.float32)
T.annotate_layout({out_shared: tilelang.layout.make_hcu_swizzled_layout(out_shared, major_pack=2)})
T.copy(out_frag, out_shared)
# Store this split's sum of squares, skipping tokens beyond num_tokens.
for i in T.Parallel(token_block):
t = px * token_block + i
if t < num_tokens:
sqrsum_partial[bz, t] = sq_l[i]
# Store this split's GEMM result; only the first mhc_mult3 of the 32 tile
# columns are real outputs, and tokens beyond num_tokens are skipped.
for i, j in T.Parallel(token_block, 32):
t = px * token_block + i
if t < num_tokens and j < mhc_mult3:
out_partial[bz, t, j] = out_shared[i, j]
# ---- Stage 1: reduce the partial results over the split axis ----
# This kernel is jitted without the pass_configs used for stage 0.
# NOTE(review): `out_partial` is declared here with a last dimension of 32,
# whereas stage 0 declares it as mhc_mult3. The two layouts only agree when
# mhc_mult3 == 32 - confirm which shape the caller actually allocates.
@tilelang.jit
def mhc_pre_gemm_sqrsum_splitk_stage_1(
out_partial: T.Tensor[(split_k, num_tokens, 32), T.float32],
sqrsum_partial: T.Tensor[(split_k, num_tokens), T.float32],
out: T.Tensor[(num_tokens, mhc_mult3), T.float32],
sqrsum: T.Tensor[(num_tokens,), T.float32],
):
# Threads are grouped in units of 64 (the code treats 64 threads as one warp);
# each such group handles one token.
warps_per_cta = threads // 64
# Number of 64-wide passes a warp needs to cover all split_k partials.
num_reduce = T.ceildiv(split_k, 64)
with T.Kernel(T.ceildiv(num_tokens, warps_per_cta), threads=threads) as (px,):
tx = T.get_thread_binding()
warp = tx // 64
lane = tx % 64
# Token handled by this warp.
t = px * warps_per_cta + warp
# Per-thread scalars: `s` for the sum of squares, `acc` for one output column.
s = T.alloc_local((1,), T.float32)
acc = T.alloc_local((1,), T.float32)
s[0] = 0
acc[0] = 0
if t < num_tokens:
# Each lane sums the splits lane, lane + 64, lane + 128, ...; indices at or
# beyond split_k contribute 0.
for r in T.serial(num_reduce):
bz = r * 64 + lane
s[0] += T.if_then_else(bz < split_k, sqrsum_partial[bz, t], 0.0)
# Combine the per-lane sums across the warp and store the token's total.
sqrsum[t] = T.warp_reduce_sum(s[0])
# The first mhc_mult3 lanes each own one output column and sum it serially
# over all splits.
if lane < mhc_mult3:
for bz in T.serial(split_k):
acc[0] += out_partial[bz, t, lane]
out[t, lane] = acc[0]
return (
mhc_pre_gemm_sqrsum_splitk_stage_0,
mhc_pre_gemm_sqrsum_splitk_stage_1,
)
{
"config": {
"(8, 192, 128, False, True, True, True)": {
"BLOCK_M": 32,
"BLOCK_N": 32,
"waves_per_eu": 1,
"matrix_instr_nonkdim": 16,
"sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 4,
"num_ctas": 1,
"num_stages": 2
},
"(8, 192, 128, True, True, True, True)": {
"BLOCK_M": 32,
"BLOCK_N": 32,
"waves_per_eu": 1,
"matrix_instr_nonkdim": 16,
"sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 4,
"num_ctas": 1,
"num_stages": 2
},
"(16, 192, 128, False, True, False, False)": {
"BLOCK_M": 64,
"BLOCK_N": 32,
"waves_per_eu": 1,
"matrix_instr_nonkdim": 16,
"sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 4,
"num_ctas": 1,
"num_stages": 1
},
"(16, 192, 128, True, True, False, False)": {
"BLOCK_M": 64,
"BLOCK_N": 64,
"waves_per_eu": 1,
"matrix_instr_nonkdim": 16,
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_ctas": 1,
"num_stages": 1
}
},
"path": {}
}
{ {
"config": { "config": {
"(8, 192, 128, False, True, True, 128)": { "(8, 192, 128, False, True, True, True)": {
"BLOCK_M": 32, "BLOCK_M": 32,
"BLOCK_N": 64, "BLOCK_N": 64,
"waves_per_eu": 1, "waves_per_eu": 1,
...@@ -11,7 +11,29 @@ ...@@ -11,7 +11,29 @@
"num_ctas": 1, "num_ctas": 1,
"num_stages": 1 "num_stages": 1
}, },
"(16, 192, 128, False, True, False, -1)": { "(8, 192, 128, True, True, True, True)": {
"BLOCK_M": 32,
"BLOCK_N": 64,
"waves_per_eu": 1,
"matrix_instr_nonkdim": 16,
"sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 4,
"num_ctas": 1,
"num_stages": 1
},
"(16, 192, 128, False, True, False, False)": {
"BLOCK_M": 32,
"BLOCK_N": 64,
"waves_per_eu": 1,
"matrix_instr_nonkdim": 16,
"sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 4,
"num_ctas": 1,
"num_stages": 1
},
"(16, 192, 128, True, True, False, False)": {
"BLOCK_M": 32, "BLOCK_M": 32,
"BLOCK_N": 64, "BLOCK_N": 64,
"waves_per_eu": 1, "waves_per_eu": 1,
......
{
"config": {
"default": {
"BK": 128,
"BV": 64,
"num_warps": 4,
"num_stages": 2
},
"K=128,V=128,BT=64": {
"BK": 128,
"BV": 64,
"num_warps": 4,
"num_stages": 2
},
"K=128,V=128,BT=32": {
"BK": 128,
"BV": 64,
"num_warps": 4,
"num_stages": 2
}
}
}
{
"config": {
"default": {
"BK": 128,
"BV": 64,
"num_warps": 4,
"num_stages": 2
},
"K=128,V=128,BT=64": {
"BK": 128,
"BV": 64,
"num_warps": 4,
"num_stages": 2
},
"K=128,V=128,BT=32": {
"BK": 128,
"BV": 64,
"num_warps": 4,
"num_stages": 2
}
}
}
{
"config": {
"default": {
"BV": 32,
"num_warps": 8,
"num_stages": 2
},
"K=128,V=128,BT=64,H=8": {
"BV": 16,
"num_warps": 4,
"num_stages": 2
},
"K=128,V=128,BT=32,H=8": {
"BV": 32,
"num_warps": 8,
"num_stages": 2
}
}
}
{
"config": {
"default": {
"BV": 32,
"num_warps": 8,
"num_stages": 2
},
"K=128,V=128,BT=64,H=8": {
"BV": 16,
"num_warps": 4,
"num_stages": 2
},
"K=128,V=128,BT=32,H=8": {
"BV": 32,
"num_warps": 8,
"num_stages": 2
}
}
}
{
"key": [
"batch_size",
"kv_group_num",
"Lq",
"Lv",
"USE_CUSTOM_MASK",
"IS_CAUSAL",
"SKIP_PREFIX_CUSTOM_MASK",
"HAS_SINK",
"SLIDING_WINDOW_SIZE",
"xai_temperature_len",
"Q_Extend",
"K_Extend",
"V_Extend",
"O_Extend",
"K_Buffer",
"V_Buffer",
"qo_indptr",
"kv_indptr",
"kv_indices"
],
"config": {
"(1, 16, 192, 128, False, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int64', 'torch.int32', 'torch.int64')": {
"BLOCK_M": 128,
"BLOCK_N": 32,
"waves_per_eu": 1,
"schedule_hint": "none",
"matrix_instr_nonkdim": 16,
"sched_latency": "none",
"kpack": 2,
"USE_MLS": false,
"num_warps": 4,
"num_ctas": 1,
"num_stages": 2
}
},
"path": {
}
}
\ No newline at end of file
{
"key": [
"batch_size",
"kv_group_num",
"Lq",
"Lv",
"USE_CUSTOM_MASK",
"IS_CAUSAL",
"SKIP_PREFIX_CUSTOM_MASK",
"HAS_SINK",
"SLIDING_WINDOW_SIZE",
"xai_temperature_len",
"Q_Extend",
"K_Extend",
"V_Extend",
"O_Extend",
"K_Buffer",
"V_Buffer",
"qo_indptr",
"kv_indptr",
"kv_indices",
"mask_ptr",
"mask_indptr",
"sink_ptr",
"window_kv_offset_ptr"
],
"config": {
"(32, 8, 192, 128, True, True, True, True, 128, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16', 'torch.int64')": {
"BLOCK_M": 32,
"BLOCK_N": 32,
"waves_per_eu": 1,
"schedule_hint": "local-prefetch",
"matrix_instr_nonkdim": 16,
"sched_latency": "mmac5-ds10",
"kpack": 1,
"USE_MLS": false,
"num_warps": 4,
"num_ctas": 1,
"num_stages": 3
},
"(32, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64')": {
"BLOCK_M": 32,
"BLOCK_N": 32,
"waves_per_eu": 1,
"schedule_hint": "local-prefetch",
"matrix_instr_nonkdim": 16,
"sched_latency": "none",
"kpack": 2,
"USE_MLS": false,
"num_warps": 2,
"num_ctas": 1,
"num_stages": 3
},
"(1, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64')": {
"BLOCK_M": 32,
"BLOCK_N": 32,
"waves_per_eu": 1,
"schedule_hint": "none",
"matrix_instr_nonkdim": 16,
"sched_latency": "mmac5-ds10",
"kpack": 2,
"USE_MLS": false,
"num_warps": 2,
"num_ctas": 1,
"num_stages": 3
},
"(3, 8, 192, 128, True, True, True, True, 128, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16', 'torch.int64')": {
"BLOCK_M": 16,
"BLOCK_N": 64,
"waves_per_eu": 1,
"schedule_hint": "none",
"matrix_instr_nonkdim": 16,
"sched_latency": "none",
"kpack": 2,
"USE_MLS": false,
"num_warps": 4,
"num_ctas": 1,
"num_stages": 1
},
"(3, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64')": {
"BLOCK_M": 32,
"BLOCK_N": 32,
"waves_per_eu": 1,
"schedule_hint": "none",
"matrix_instr_nonkdim": 16,
"sched_latency": "mmac5-ds10",
"kpack": 2,
"USE_MLS": false,
"num_warps": 2,
"num_ctas": 1,
"num_stages": 3
},
"(32, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16')": {
"BLOCK_M": 32,
"BLOCK_N": 32,
"waves_per_eu": 1,
"schedule_hint": "local-prefetch",
"matrix_instr_nonkdim": 16,
"sched_latency": "none",
"kpack": 2,
"USE_MLS": false,
"num_warps": 2,
"num_ctas": 1,
"num_stages": 3
},
"(1, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16')": {
"BLOCK_M": 16,
"BLOCK_N": 32,
"waves_per_eu": 1,
"schedule_hint": "none",
"matrix_instr_nonkdim": 16,
"sched_latency": "mmac5-ds10",
"kpack": 2,
"USE_MLS": false,
"num_warps": 2,
"num_ctas": 1,
"num_stages": 3
},
"(3, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16')": {
"BLOCK_M": 16,
"BLOCK_N": 32,
"waves_per_eu": 1,
"schedule_hint": "local-prefetch",
"matrix_instr_nonkdim": 16,
"sched_latency": "mmac5-ds10",
"kpack": 2,
"USE_MLS": false,
"num_warps": 2,
"num_ctas": 1,
"num_stages": 3
},
"(32, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16', 'torch.int32')": {
"BLOCK_M": 32,
"BLOCK_N": 32,
"waves_per_eu": 1,
"schedule_hint": "local-prefetch",
"matrix_instr_nonkdim": 16,
"sched_latency": "none",
"kpack": 2,
"USE_MLS": false,
"num_warps": 2,
"num_ctas": 1,
"num_stages": 3
},
"(1, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16', 'torch.int32')": {
"BLOCK_M": 32,
"BLOCK_N": 32,
"waves_per_eu": 1,
"schedule_hint": "local-prefetch",
"matrix_instr_nonkdim": 16,
"sched_latency": "none",
"kpack": 2,
"USE_MLS": false,
"num_warps": 2,
"num_ctas": 1,
"num_stages": 3
},
"(3, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16', 'torch.int32')": {
"BLOCK_M": 32,
"BLOCK_N": 32,
"waves_per_eu": 1,
"schedule_hint": "local-prefetch",
"matrix_instr_nonkdim": 16,
"sched_latency": "none",
"kpack": 2,
"USE_MLS": false,
"num_warps": 2,
"num_ctas": 1,
"num_stages": 3
}
}
}
\ No newline at end of file
{
"config": {
"default": {
"BV": 32,
"num_warps": 1,
"num_stages": 2
},
"B=1,H=4,HV=16": {
"BV": 16,
"num_warps": 4,
"num_stages": 1
},
"B=2,H=4,HV=16": {
"BV": 16,
"num_warps": 4,
"num_stages": 2
},
"B=4,H=4,HV=16": {
"BV": 32,
"num_warps": 4,
"num_stages": 2
},
"B=8,H=4,HV=16": {
"BV": 32,
"num_warps": 4,
"num_stages": 2
},
"B=16,H=4,HV=16": {
"BV": 32,
"num_warps": 2,
"num_stages": 1
},
"B=32,H=4,HV=16": {
"BV": 32,
"num_warps": 1,
"num_stages": 1
},
"B=50,H=4,HV=16": {
"BV": 32,
"num_warps": 1,
"num_stages": 1
},
"B=64,H=4,HV=16": {
"BV": 32,
"num_warps": 1,
"num_stages": 2
},
"B=128,H=4,HV=16": {
"BV": 32,
"num_warps": 1,
"num_stages": 2
},
"B=256,H=4,HV=16": {
"BV": 32,
"num_warps": 1,
"num_stages": 2
}
}
}
\ No newline at end of file
{
"config": {
"default": {
"BV": 32,
"num_warps": 1,
"num_stages": 2
},
"B=1,H=4,HV=16": {
"BV": 16,
"num_warps": 4,
"num_stages": 1
},
"B=2,H=4,HV=16": {
"BV": 16,
"num_warps": 4,
"num_stages": 2
},
"B=4,H=4,HV=16": {
"BV": 32,
"num_warps": 4,
"num_stages": 2
},
"B=8,H=4,HV=16": {
"BV": 32,
"num_warps": 4,
"num_stages": 2
},
"B=16,H=4,HV=16": {
"BV": 32,
"num_warps": 2,
"num_stages": 1
},
"B=32,H=4,HV=16": {
"BV": 32,
"num_warps": 1,
"num_stages": 1
},
"B=50,H=4,HV=16": {
"BV": 32,
"num_warps": 1,
"num_stages": 2
},
"B=64,H=4,HV=16": {
"BV": 32,
"num_warps": 1,
"num_stages": 2
},
"B=128,H=4,HV=16": {
"BV": 32,
"num_warps": 1,
"num_stages": 2
},
"B=256,H=4,HV=16": {
"BV": 32,
"num_warps": 1,
"num_stages": 2
}
}
}
{
"config": {
"default": {
"BV": 32,
"num_warps": 1
},
"T=4,H=4,HV=16": {
"BV": 32,
"num_warps": 4
},
"T=8,H=4,HV=16": {
"BV": 32,
"num_warps": 4
},
"T=16,H=4,HV=16": {
"BV": 64,
"num_warps": 4
},
"T=32,H=4,HV=16": {
"BV": 64,
"num_warps": 4
},
"T=64,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=128,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=256,H=4,HV=16": {
"BV": 32,
"num_warps": 1
}
}
}
\ No newline at end of file
{
"config": {
"default": {
"BV": 32,
"num_warps": 1
},
"T=4,H=4,HV=16": {
"BV": 32,
"num_warps": 4
},
"T=8,H=4,HV=16": {
"BV": 32,
"num_warps": 4
},
"T=16,H=4,HV=16": {
"BV": 64,
"num_warps": 4
},
"T=32,H=4,HV=16": {
"BV": 64,
"num_warps": 4
},
"T=64,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=128,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=256,H=4,HV=16": {
"BV": 32,
"num_warps": 1
}
}
}
\ No newline at end of file
{
"config": {
"default": {
"BV": 32,
"num_warps": 1
},
"T=4,H=4,HV=16": {
"BV": 32,
"num_warps": 4
},
"T=16,H=4,HV=16": {
"BV": 16,
"num_warps": 1
},
"T=32,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=64,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=128,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=192,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=4,H=2,HV=8": {
"BV": 16,
"num_warps": 4
},
"T=16,H=2,HV=8": {
"BV": 32,
"num_warps": 4
},
"T=32,H=2,HV=8": {
"BV": 16,
"num_warps": 1
},
"T=64,H=2,HV=8": {
"BV": 16,
"num_warps": 1
},
"T=128,H=2,HV=8": {
"BV": 16,
"num_warps": 1
},
"T=192,H=2,HV=8": {
"BV": 16,
"num_warps": 1
}
}
}
{
"config": {
"default": {
"BV": 32,
"num_warps": 1
},
"T=4,H=4,HV=16": {
"BV": 32,
"num_warps": 4
},
"T=16,H=4,HV=16": {
"BV": 16,
"num_warps": 1
},
"T=32,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=64,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=128,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=192,H=4,HV=16": {
"BV": 32,
"num_warps": 1
},
"T=4,H=2,HV=8": {
"BV": 16,
"num_warps": 4
},
"T=16,H=2,HV=8": {
"BV": 32,
"num_warps": 4
},
"T=32,H=2,HV=8": {
"BV": 16,
"num_warps": 1
},
"T=64,H=2,HV=8": {
"BV": 16,
"num_warps": 1
},
"T=128,H=2,HV=8": {
"BV": 16,
"num_warps": 1
},
"T=192,H=2,HV=8": {
"BV": 16,
"num_warps": 1
}
}
}
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 512,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 512,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 512,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 512,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 1
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 1
},
"8192": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 1
},
"16384": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 1
}
}
\ No newline at end of file
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 1
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 1
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"8192": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 8,
"num_stages": 2
},
"16384": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
}
}
\ No newline at end of file
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"8192": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 1
},
"16384": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
}
}
\ No newline at end of file
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 1
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"8192": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 1
},
"16384": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": true,
"instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 8,
"num_stages": 1
}
}
\ No newline at end of file
...@@ -3,110 +3,143 @@ ...@@ -3,110 +3,143 @@
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 2 "num_stages": 2
}, },
"2": { "2": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 16, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 2 "num_stages": 2
}, },
"4": { "4": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 1 "num_stages": 2
}, },
"8": { "8": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"instruction_sched_variant": "none", "USE_MLS_LOAD": false,
"num_warps": 8, "instruction_sched_variant": "local-prefetch",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2 "num_stages": 2
}, },
"16": { "16": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"instruction_sched_variant": "none", "USE_MLS_LOAD": false,
"num_warps": 8, "instruction_sched_variant": "local-prefetch",
"sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2 "num_stages": 2
}, },
"24": { "24": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"num_warps": 8, "sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 4,
"num_stages": 2 "num_stages": 2
}, },
"32": { "32": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"num_warps": 8, "sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 4,
"num_stages": 2 "num_stages": 2
}, },
"64": { "64": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"num_warps": 8, "sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 4,
"num_stages": 2 "num_stages": 2
}, },
"128": { "128": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"num_warps": 8, "sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 4,
"num_stages": 2 "num_stages": 2
}, },
"256": { "256": {
"BLOCK_SIZE_M": 16, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"num_warps": 8, "sched_latency": "none",
"kpack": 1,
"num_warps": 2,
"num_stages": 2 "num_stages": 2
}, },
"512": { "512": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"num_warps": 2, "sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2 "num_stages": 2
}, },
"1024": { "1024": {
...@@ -115,48 +148,76 @@ ...@@ -115,48 +148,76 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"num_warps": 8, "sched_latency": "none",
"kpack": 1,
"num_warps": 4,
"num_stages": 2 "num_stages": 2
}, },
"2048": { "2048": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 2 "num_stages": 2
}, },
"4096": { "4096": {
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"sched_latency": "none",
"kpack": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 2 "num_stages": 2
}, },
"8192": { "8192": {
"BLOCK_SIZE_M": 32, "BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": true,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"num_warps": 4, "sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 8,
"num_stages": 2 "num_stages": 2
}, },
"16384": { "16384": {
"BLOCK_SIZE_M": 32, "BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": true, "COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none", "instruction_sched_variant": "none",
"num_warps": 4, "sched_latency": "none",
"num_stages": 2 "kpack": 1,
"num_warps": 8,
"num_stages": 1
},
"32768": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"COMBINE_SCALE_LOAD": false,
"USE_MLS_LOAD": false,
"instruction_sched_variant": "none",
"sched_latency": "mmac5-ds10",
"kpack": 1,
"num_warps": 8,
"num_stages": 1
} }
} }
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment