Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
aiter
Commits
bb596f6e
Commit
bb596f6e
authored
Jun 04, 2026
by
xiaowei.zhang
Browse files
1. Update MOE; 2. Update sglang mHC; 3. Update test scripts; 4 Add new
ops.
parent
d9ebb683
Changes
243
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1750 additions
and
50 deletions
+1750
-50
aiter/ops/tilelang/mhc/pre_norm_fn_splitk_kernel.py
aiter/ops/tilelang/mhc/pre_norm_fn_splitk_kernel.py
+132
-0
aiter/ops/triton/configs/BW200B-EXTEND_ATTENTION-V2-DECODE-FP16.json
...riton/configs/BW200B-EXTEND_ATTENTION-V2-DECODE-FP16.json
+49
-0
aiter/ops/triton/configs/BW200B-EXTEND_ATTENTION-V2-FP16.json
...r/ops/triton/configs/BW200B-EXTEND_ATTENTION-V2-FP16.json
+24
-2
aiter/ops/triton/configs/chunk_fwd_o/chunk_fwd_o-gfx936.json
aiter/ops/triton/configs/chunk_fwd_o/chunk_fwd_o-gfx936.json
+22
-0
aiter/ops/triton/configs/chunk_fwd_o/chunk_fwd_o-gfx938.json
aiter/ops/triton/configs/chunk_fwd_o/chunk_fwd_o-gfx938.json
+22
-0
aiter/ops/triton/configs/chunk_gated_delta_rule_fwd_h/chunk_gated_delta_rule_fwd_h-gfx936.json
...delta_rule_fwd_h/chunk_gated_delta_rule_fwd_h-gfx936.json
+19
-0
aiter/ops/triton/configs/chunk_gated_delta_rule_fwd_h/chunk_gated_delta_rule_fwd_h-gfx938.json
...delta_rule_fwd_h/chunk_gated_delta_rule_fwd_h-gfx938.json
+19
-0
aiter/ops/triton/configs/extend_attn/_fwd_kernel_v2-device=gfx938_cu72.json
...onfigs/extend_attn/_fwd_kernel_v2-device=gfx938_cu72.json
+40
-0
aiter/ops/triton/configs/extend_attn/_fwd_kernel_v2_decode-device=gfx938_cu72.json
...extend_attn/_fwd_kernel_v2_decode-device=gfx938_cu72.json
+172
-0
aiter/ops/triton/configs/fused_recurrent_gated_delta_rule_packed_decode/fused_recurrent_gated_delta_rule_packed_decode-gfx936.json
...used_recurrent_gated_delta_rule_packed_decode-gfx936.json
+59
-0
aiter/ops/triton/configs/fused_recurrent_gated_delta_rule_packed_decode/fused_recurrent_gated_delta_rule_packed_decode-gfx938.json
...used_recurrent_gated_delta_rule_packed_decode-gfx938.json
+59
-0
aiter/ops/triton/configs/fused_sigmoid_gating_delta_rule_update/fused_sigmoid_gating_delta_rule_update-gfx936.json
...update/fused_sigmoid_gating_delta_rule_update-gfx936.json
+36
-0
aiter/ops/triton/configs/fused_sigmoid_gating_delta_rule_update/fused_sigmoid_gating_delta_rule_update-gfx938.json
...update/fused_sigmoid_gating_delta_rule_update-gfx938.json
+36
-0
aiter/ops/triton/configs/fused_sigmoid_gating_delta_rule_update_recurrent/fused_sigmoid_gating_delta_rule_update_recurrent-gfx936.json
...ed_sigmoid_gating_delta_rule_update_recurrent-gfx936.json
+56
-0
aiter/ops/triton/configs/fused_sigmoid_gating_delta_rule_update_recurrent/fused_sigmoid_gating_delta_rule_update_recurrent-gfx938.json
...ed_sigmoid_gating_delta_rule_update_recurrent-gfx938.json
+56
-0
aiter/ops/triton/configs/moe/E=160,N=320,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True.json
...320,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True.json
+210
-0
aiter/ops/triton/configs/moe/E=160,N=320,device_name=BW200B,dtype=fp8_w8a8.json
...gs/moe/E=160,N=320,device_name=BW200B,dtype=fp8_w8a8.json
+210
-0
aiter/ops/triton/configs/moe/E=160,N=320,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json
...20,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json
+210
-0
aiter/ops/triton/configs/moe/E=160,N=320,device_name=BW200B,dtype=int8_w8a8.json
...s/moe/E=160,N=320,device_name=BW200B,dtype=int8_w8a8.json
+210
-0
aiter/ops/triton/configs/moe/E=256,N=128,device_name=K100_AI,dtype=int8_w8a8,block_shape=[128,128].json
...e_name=K100_AI,dtype=int8_w8a8,block_shape=[128,128].json
+109
-48
No files found.
Too many changes to show.
To preserve performance only
243 of 243+
files are displayed.
Plain diff
Email patch
aiter/ops/tilelang/mhc/pre_norm_fn_splitk_kernel.py
0 → 100644
View file @
bb596f6e
import
functools
from
typing
import
Tuple
import
tilelang
from
tilelang
import
language
as
T
# tilelang pass-config switches passed as ``pass_configs`` to the
# ``tilelang.jit`` decorator of the stage-0 kernel below (the stage-1 kernel is
# decorated with a bare ``tilelang.jit`` and does not receive them).
# Both switches are simply turned on; their exact effect is defined by
# tilelang's ``PassConfigKey`` documentation, not by this file.
_PASS_CONFIGS = {
    tilelang.PassConfigKey.TL_ENABLE_AGGRESSIVE_SHARED_MEMORY_MERGE: True,
    tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
}
@functools.cache
def mhc_pre_gemm_sqrsum_splitk_kernel(
    mhc_mult3: int,
    mhc_hidden_size: int,
    split_k: int,
    token_block: int = 64,
    hidden_block: int = 256,
    threads: int = 256,
) -> Tuple[tilelang.JITKernel, tilelang.JITKernel]:
    """Build the two split-K kernels for the fused ``x @ fn^T`` / sum-of-squares op.

    The hidden dimension is cut into ``split_k`` equal slices.

    * Stage 0 processes one ``(slice, token tile)`` pair per kernel block. For
      its slice it accumulates the product of ``x`` with ``fn`` transposed and
      the sum of ``x * x`` per token, and writes both to the ``*_partial``
      tensors.
    * Stage 1 adds the partial results over the ``split_k`` axis and writes the
      final ``out`` and ``sqrsum`` tensors.

    The result is memoised with ``functools.cache``, so each distinct argument
    tuple builds its kernels once.

    Args:
        mhc_mult3: Number of rows of ``fn`` / columns of ``out``. Must be
            ``<= 32`` because the accumulator tile has 32 columns.
        mhc_hidden_size: Length of the reduced (hidden) dimension.
        split_k: Number of slices the hidden dimension is divided into.
        token_block: Number of tokens handled by one stage-0 block.
        hidden_block: Number of hidden elements consumed per stage-0 loop
            iteration. Must be a multiple of 16.
        threads: Threads per kernel block, used by both stages. Must be a
            positive multiple of 64.

    Returns:
        ``(stage_0, stage_1)`` -- the two kernel objects, in launch order.

    Raises:
        AssertionError: If any of the size constraints above is violated.
    """
    assert mhc_mult3 <= 32, f"mhc_mult3 must be <= 32, got {mhc_mult3}"
    assert mhc_hidden_size % hidden_block == 0, (
        f"mhc_hidden_size ({mhc_hidden_size}) must be a multiple of "
        f"hidden_block ({hidden_block})"
    )
    assert mhc_hidden_size % split_k == 0, (
        f"mhc_hidden_size ({mhc_hidden_size}) must be a multiple of "
        f"split_k ({split_k})"
    )
    split_size = mhc_hidden_size // split_k
    assert split_size % hidden_block == 0, (
        f"per-split size ({split_size}) must be a multiple of "
        f"hidden_block ({hidden_block})"
    )
    # Stage 0 accumulates squares over hidden_block // 16 slices of 16 columns;
    # any remainder columns would silently be left out of sqrsum.
    assert hidden_block % 16 == 0, (
        f"hidden_block must be a multiple of 16, got {hidden_block}"
    )
    # Stage 1 assigns one token to each group of 64 consecutive threads
    # (threads // 64 groups per block). Fewer than 64 threads would give a zero
    # divisor in the grid computation, and a partial group would feed
    # T.warp_reduce_sum with missing lanes.
    assert threads >= 64 and threads % 64 == 0, (
        f"threads must be a positive multiple of 64, got {threads}"
    )

    num_tokens = T.dynamic("num_tokens")

    @tilelang.jit(pass_configs=_PASS_CONFIGS)
    def mhc_pre_gemm_sqrsum_splitk_stage_0(
        x: T.Tensor[(num_tokens, mhc_hidden_size), T.bfloat16],
        fn: T.Tensor[(mhc_mult3, mhc_hidden_size), T.float32],
        out_partial: T.Tensor[(split_k, num_tokens, mhc_mult3), T.float32],
        sqrsum_partial: T.Tensor[(split_k, num_tokens), T.float32],
    ):
        # Grid: first axis selects the hidden-dimension slice (bz), second axis
        # selects the token tile (px).
        with T.Kernel(
            split_k,
            T.ceildiv(num_tokens, token_block),
            threads=threads,
        ) as (bz, px):
            # Accumulators that live across the whole slice loop.
            out_frag = T.alloc_fragment((token_block, 32), T.float32)
            sq_part4 = T.alloc_fragment((token_block, 16), T.float32)
            T.clear(out_frag)
            T.clear(sq_part4)

            # First hidden index of the slice owned by this block.
            k_base = bz * split_size
            for pz in T.Pipelined(split_size // hidden_block, num_stages=0):
                x_frag_pre = T.alloc_fragment((token_block, hidden_block), T.bfloat16)
                fn_frag_pre = T.alloc_fragment((32, hidden_block), T.float32)
                x_frag_16 = T.alloc_fragment((token_block, hidden_block), T.bfloat16)
                x_frag = T.alloc_fragment((token_block, hidden_block), T.float32)
                fn_frag = T.alloc_fragment((32, hidden_block), T.float32)
                x_smem_16 = T.alloc_shared((token_block, hidden_block), T.bfloat16)
                fn_smem = T.alloc_shared((32, hidden_block), T.float32)
                T.annotate_layout(
                    {
                        x_smem_16: tilelang.layout.make_hcu_swizzled_layout(
                            x_smem_16, major_pack=2
                        )
                    }
                )
                T.annotate_layout(
                    {
                        fn_smem: tilelang.layout.make_hcu_swizzled_layout(
                            fn_smem, major_pack=2
                        )
                    }
                )

                # Load the current tiles. The fn tile always has 32 rows even
                # though fn itself has mhc_mult3 rows.
                # NOTE(review): presumably T.copy guards the rows past
                # mhc_mult3 -- confirm against tilelang's copy semantics.
                T.copy(x[px * token_block, k_base + pz * hidden_block], x_frag_pre)
                T.copy(fn[0, k_base + pz * hidden_block], fn_frag_pre)

                # x: fragment -> shared -> fragment, then bfloat16 -> float32
                # through the copy into the float32 fragment.
                T.copy(x_frag_pre, x_smem_16)
                T.copy(x_smem_16, x_frag_16)
                T.copy(x_frag_16, x_frag)
                # fn: fragment -> shared -> fragment, float32 throughout.
                T.copy(fn_frag_pre, fn_smem)
                T.copy(fn_smem, fn_frag)

                # Fold x * x into 16 running column sums per token; they are
                # reduced to a single value per token after the loop.
                for jj in T.serial(hidden_block // 16):
                    for i, j in T.Parallel(token_block, 16):
                        v = x_frag[i, jj * 16 + j]
                        sq_part4[i, j] += v * v

                # out_frag += x_frag @ fn_frag^T
                T.gemm(
                    x_frag,
                    fn_frag,
                    out_frag,
                    transpose_A=False,
                    transpose_B=True,
                    k_pack=2,
                    policy=T.GemmWarpPolicy.FullRow,
                    use_tf32=True,
                )

            # Collapse the 16 column sums into one sum of squares per token.
            sq_l = T.alloc_fragment((token_block,), T.float32)
            T.reduce_sum(sq_part4, sq_l)

            # Stage the accumulator through a shared buffer before the guarded
            # element-wise store below.
            out_shared = T.alloc_shared((token_block, 32), T.float32)
            T.annotate_layout(
                {
                    out_shared: tilelang.layout.make_hcu_swizzled_layout(
                        out_shared, major_pack=2
                    )
                }
            )
            T.copy(out_frag, out_shared)

            # Store only rows that map to real tokens (the last tile may be
            # partial) and only the first mhc_mult3 of the 32 columns.
            for i in T.Parallel(token_block):
                t = px * token_block + i
                if t < num_tokens:
                    sqrsum_partial[bz, t] = sq_l[i]
            for i, j in T.Parallel(token_block, 32):
                t = px * token_block + i
                if t < num_tokens and j < mhc_mult3:
                    out_partial[bz, t, j] = out_shared[i, j]

    # NOTE(review): stage 0 above declares out_partial with a trailing extent
    # of mhc_mult3, while this stage declares it as 32; the two declarations
    # agree only when mhc_mult3 == 32. Left unchanged because the code that
    # allocates the buffer is not visible here -- confirm against the caller.
    @tilelang.jit
    def mhc_pre_gemm_sqrsum_splitk_stage_1(
        out_partial: T.Tensor[(split_k, num_tokens, 32), T.float32],
        sqrsum_partial: T.Tensor[(split_k, num_tokens), T.float32],
        out: T.Tensor[(num_tokens, mhc_mult3), T.float32],
        sqrsum: T.Tensor[(num_tokens,), T.float32],
    ):
        # One token per group of 64 consecutive threads.
        warps_per_cta = threads // 64
        # Number of 64-wide passes needed to visit all split_k partials.
        num_reduce = T.ceildiv(split_k, 64)
        with T.Kernel(T.ceildiv(num_tokens, warps_per_cta), threads=threads) as (px,):
            tx = T.get_thread_binding()
            warp = tx // 64
            lane = tx % 64
            # Token handled by this thread group.
            t = px * warps_per_cta + warp
            s = T.alloc_local((1,), T.float32)
            acc = T.alloc_local((1,), T.float32)
            s[0] = 0
            acc[0] = 0
            if t < num_tokens:
                # Each lane sums the partials at indices lane, lane + 64, ...;
                # indices past split_k contribute 0.
                for r in T.serial(num_reduce):
                    bz = r * 64 + lane
                    s[0] += T.if_then_else(bz < split_k, sqrsum_partial[bz, t], 0.0)
                # Combine the per-lane sums into the token's total.
                sqrsum[t] = T.warp_reduce_sum(s[0])
                # Lane j (< mhc_mult3) owns output column j and adds its
                # split_k partial values one after another.
                if lane < mhc_mult3:
                    for bz in T.serial(split_k):
                        acc[0] += out_partial[bz, t, lane]
                    out[t, lane] = acc[0]

    return (
        mhc_pre_gemm_sqrsum_splitk_stage_0,
        mhc_pre_gemm_sqrsum_splitk_stage_1,
    )
aiter/ops/triton/configs/BW200B-EXTEND_ATTENTION-V2-DECODE-FP16.json
0 → 100644
View file @
bb596f6e
{
"config"
:
{
"(8, 192, 128, False, True, True, True)"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_ctas"
:
1
,
"num_stages"
:
2
},
"(8, 192, 128, True, True, True, True)"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_ctas"
:
1
,
"num_stages"
:
2
},
"(16, 192, 128, False, True, False, False)"
:
{
"BLOCK_M"
:
64
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_ctas"
:
1
,
"num_stages"
:
1
},
"(16, 192, 128, True, True, False, False)"
:
{
"BLOCK_M"
:
64
,
"BLOCK_N"
:
64
,
"waves_per_eu"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_ctas"
:
1
,
"num_stages"
:
1
}
},
"path"
:
{}
}
aiter/ops/triton/configs/BW200B-EXTEND_ATTENTION-V2-FP16.json
View file @
bb596f6e
{
"config"
:
{
"(8, 192, 128, False, True, True,
128
)"
:
{
"(8, 192, 128, False, True, True,
True
)"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
64
,
"waves_per_eu"
:
1
,
...
...
@@ -11,7 +11,29 @@
"num_ctas"
:
1
,
"num_stages"
:
1
},
"(16, 192, 128, False, True, False, -1)"
:
{
"(8, 192, 128, True, True, True, True)"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
64
,
"waves_per_eu"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_ctas"
:
1
,
"num_stages"
:
1
},
"(16, 192, 128, False, True, False, False)"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
64
,
"waves_per_eu"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_ctas"
:
1
,
"num_stages"
:
1
},
"(16, 192, 128, True, True, False, False)"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
64
,
"waves_per_eu"
:
1
,
...
...
aiter/ops/triton/configs/chunk_fwd_o/chunk_fwd_o-gfx936.json
0 → 100644
View file @
bb596f6e
{
"config"
:
{
"default"
:
{
"BK"
:
128
,
"BV"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"K=128,V=128,BT=64"
:
{
"BK"
:
128
,
"BV"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"K=128,V=128,BT=32"
:
{
"BK"
:
128
,
"BV"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
}
aiter/ops/triton/configs/chunk_fwd_o/chunk_fwd_o-gfx938.json
0 → 100644
View file @
bb596f6e
{
"config"
:
{
"default"
:
{
"BK"
:
128
,
"BV"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"K=128,V=128,BT=64"
:
{
"BK"
:
128
,
"BV"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"K=128,V=128,BT=32"
:
{
"BK"
:
128
,
"BV"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
}
aiter/ops/triton/configs/chunk_gated_delta_rule_fwd_h/chunk_gated_delta_rule_fwd_h-gfx936.json
0 → 100644
View file @
bb596f6e
{
"config"
:
{
"default"
:
{
"BV"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"K=128,V=128,BT=64,H=8"
:
{
"BV"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"K=128,V=128,BT=32,H=8"
:
{
"BV"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
2
}
}
}
aiter/ops/triton/configs/chunk_gated_delta_rule_fwd_h/chunk_gated_delta_rule_fwd_h-gfx938.json
0 → 100644
View file @
bb596f6e
{
"config"
:
{
"default"
:
{
"BV"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"K=128,V=128,BT=64,H=8"
:
{
"BV"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"K=128,V=128,BT=32,H=8"
:
{
"BV"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
2
}
}
}
aiter/ops/triton/configs/extend_attn/_fwd_kernel_v2-device=gfx938_cu72.json
0 → 100644
View file @
bb596f6e
{
"key"
:
[
"batch_size"
,
"kv_group_num"
,
"Lq"
,
"Lv"
,
"USE_CUSTOM_MASK"
,
"IS_CAUSAL"
,
"SKIP_PREFIX_CUSTOM_MASK"
,
"HAS_SINK"
,
"SLIDING_WINDOW_SIZE"
,
"xai_temperature_len"
,
"Q_Extend"
,
"K_Extend"
,
"V_Extend"
,
"O_Extend"
,
"K_Buffer"
,
"V_Buffer"
,
"qo_indptr"
,
"kv_indptr"
,
"kv_indices"
],
"config"
:
{
"(1, 16, 192, 128, False, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int64', 'torch.int32', 'torch.int64')"
:
{
"BLOCK_M"
:
128
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"none"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"none"
,
"kpack"
:
2
,
"USE_MLS"
:
false
,
"num_warps"
:
4
,
"num_ctas"
:
1
,
"num_stages"
:
2
}
},
"path"
:
{
}
}
\ No newline at end of file
aiter/ops/triton/configs/extend_attn/_fwd_kernel_v2_decode-device=gfx938_cu72.json
0 → 100644
View file @
bb596f6e
{
"key"
:
[
"batch_size"
,
"kv_group_num"
,
"Lq"
,
"Lv"
,
"USE_CUSTOM_MASK"
,
"IS_CAUSAL"
,
"SKIP_PREFIX_CUSTOM_MASK"
,
"HAS_SINK"
,
"SLIDING_WINDOW_SIZE"
,
"xai_temperature_len"
,
"Q_Extend"
,
"K_Extend"
,
"V_Extend"
,
"O_Extend"
,
"K_Buffer"
,
"V_Buffer"
,
"qo_indptr"
,
"kv_indptr"
,
"kv_indices"
,
"mask_ptr"
,
"mask_indptr"
,
"sink_ptr"
,
"window_kv_offset_ptr"
],
"config"
:
{
"(32, 8, 192, 128, True, True, True, True, 128, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16', 'torch.int64')"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"local-prefetch"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"USE_MLS"
:
false
,
"num_warps"
:
4
,
"num_ctas"
:
1
,
"num_stages"
:
3
},
"(32, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64')"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"local-prefetch"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"none"
,
"kpack"
:
2
,
"USE_MLS"
:
false
,
"num_warps"
:
2
,
"num_ctas"
:
1
,
"num_stages"
:
3
},
"(1, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64')"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"none"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
2
,
"USE_MLS"
:
false
,
"num_warps"
:
2
,
"num_ctas"
:
1
,
"num_stages"
:
3
},
"(3, 8, 192, 128, True, True, True, True, 128, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16', 'torch.int64')"
:
{
"BLOCK_M"
:
16
,
"BLOCK_N"
:
64
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"none"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"none"
,
"kpack"
:
2
,
"USE_MLS"
:
false
,
"num_warps"
:
4
,
"num_ctas"
:
1
,
"num_stages"
:
1
},
"(3, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64')"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"none"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
2
,
"USE_MLS"
:
false
,
"num_warps"
:
2
,
"num_ctas"
:
1
,
"num_stages"
:
3
},
"(32, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16')"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"local-prefetch"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"none"
,
"kpack"
:
2
,
"USE_MLS"
:
false
,
"num_warps"
:
2
,
"num_ctas"
:
1
,
"num_stages"
:
3
},
"(1, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16')"
:
{
"BLOCK_M"
:
16
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"none"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
2
,
"USE_MLS"
:
false
,
"num_warps"
:
2
,
"num_ctas"
:
1
,
"num_stages"
:
3
},
"(3, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16')"
:
{
"BLOCK_M"
:
16
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"local-prefetch"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
2
,
"USE_MLS"
:
false
,
"num_warps"
:
2
,
"num_ctas"
:
1
,
"num_stages"
:
3
},
"(32, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16', 'torch.int32')"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"local-prefetch"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"none"
,
"kpack"
:
2
,
"USE_MLS"
:
false
,
"num_warps"
:
2
,
"num_ctas"
:
1
,
"num_stages"
:
3
},
"(1, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16', 'torch.int32')"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"local-prefetch"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"none"
,
"kpack"
:
2
,
"USE_MLS"
:
false
,
"num_warps"
:
2
,
"num_ctas"
:
1
,
"num_stages"
:
3
},
"(3, 16, 192, 128, True, True, True, False, -1, -1, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int64', 'torch.bool', 'torch.int64', 'torch.bfloat16', 'torch.int32')"
:
{
"BLOCK_M"
:
32
,
"BLOCK_N"
:
32
,
"waves_per_eu"
:
1
,
"schedule_hint"
:
"local-prefetch"
,
"matrix_instr_nonkdim"
:
16
,
"sched_latency"
:
"none"
,
"kpack"
:
2
,
"USE_MLS"
:
false
,
"num_warps"
:
2
,
"num_ctas"
:
1
,
"num_stages"
:
3
}
}
}
\ No newline at end of file
aiter/ops/triton/configs/fused_recurrent_gated_delta_rule_packed_decode/fused_recurrent_gated_delta_rule_packed_decode-gfx936.json
0 → 100644
View file @
bb596f6e
{
"config"
:
{
"default"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
2
},
"B=1,H=4,HV=16"
:
{
"BV"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"B=2,H=4,HV=16"
:
{
"BV"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"B=4,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"B=8,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"B=16,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
2
,
"num_stages"
:
1
},
"B=32,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
1
},
"B=50,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
1
},
"B=64,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
2
},
"B=128,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
2
},
"B=256,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
2
}
}
}
\ No newline at end of file
aiter/ops/triton/configs/fused_recurrent_gated_delta_rule_packed_decode/fused_recurrent_gated_delta_rule_packed_decode-gfx938.json
0 → 100644
View file @
bb596f6e
{
"config"
:
{
"default"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
2
},
"B=1,H=4,HV=16"
:
{
"BV"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"B=2,H=4,HV=16"
:
{
"BV"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"B=4,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"B=8,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"B=16,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
2
,
"num_stages"
:
1
},
"B=32,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
1
},
"B=50,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
2
},
"B=64,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
2
},
"B=128,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
2
},
"B=256,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
,
"num_stages"
:
2
}
}
}
aiter/ops/triton/configs/fused_sigmoid_gating_delta_rule_update/fused_sigmoid_gating_delta_rule_update-gfx936.json
0 → 100644
View file @
bb596f6e
{
"config"
:
{
"default"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=4,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
4
},
"T=8,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
4
},
"T=16,H=4,HV=16"
:
{
"BV"
:
64
,
"num_warps"
:
4
},
"T=32,H=4,HV=16"
:
{
"BV"
:
64
,
"num_warps"
:
4
},
"T=64,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=128,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=256,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
}
}
}
\ No newline at end of file
aiter/ops/triton/configs/fused_sigmoid_gating_delta_rule_update/fused_sigmoid_gating_delta_rule_update-gfx938.json
0 → 100644
View file @
bb596f6e
{
"config"
:
{
"default"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=4,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
4
},
"T=8,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
4
},
"T=16,H=4,HV=16"
:
{
"BV"
:
64
,
"num_warps"
:
4
},
"T=32,H=4,HV=16"
:
{
"BV"
:
64
,
"num_warps"
:
4
},
"T=64,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=128,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=256,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
}
}
}
\ No newline at end of file
aiter/ops/triton/configs/fused_sigmoid_gating_delta_rule_update_recurrent/fused_sigmoid_gating_delta_rule_update_recurrent-gfx936.json
0 → 100644
View file @
bb596f6e
{
"config"
:
{
"default"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=4,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
4
},
"T=16,H=4,HV=16"
:
{
"BV"
:
16
,
"num_warps"
:
1
},
"T=32,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=64,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=128,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=192,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=4,H=2,HV=8"
:
{
"BV"
:
16
,
"num_warps"
:
4
},
"T=16,H=2,HV=8"
:
{
"BV"
:
32
,
"num_warps"
:
4
},
"T=32,H=2,HV=8"
:
{
"BV"
:
16
,
"num_warps"
:
1
},
"T=64,H=2,HV=8"
:
{
"BV"
:
16
,
"num_warps"
:
1
},
"T=128,H=2,HV=8"
:
{
"BV"
:
16
,
"num_warps"
:
1
},
"T=192,H=2,HV=8"
:
{
"BV"
:
16
,
"num_warps"
:
1
}
}
}
aiter/ops/triton/configs/fused_sigmoid_gating_delta_rule_update_recurrent/fused_sigmoid_gating_delta_rule_update_recurrent-gfx938.json
0 → 100644
View file @
bb596f6e
{
"config"
:
{
"default"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=4,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
4
},
"T=16,H=4,HV=16"
:
{
"BV"
:
16
,
"num_warps"
:
1
},
"T=32,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=64,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=128,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=192,H=4,HV=16"
:
{
"BV"
:
32
,
"num_warps"
:
1
},
"T=4,H=2,HV=8"
:
{
"BV"
:
16
,
"num_warps"
:
4
},
"T=16,H=2,HV=8"
:
{
"BV"
:
32
,
"num_warps"
:
4
},
"T=32,H=2,HV=8"
:
{
"BV"
:
16
,
"num_warps"
:
1
},
"T=64,H=2,HV=8"
:
{
"BV"
:
16
,
"num_warps"
:
1
},
"T=128,H=2,HV=8"
:
{
"BV"
:
16
,
"num_warps"
:
1
},
"T=192,H=2,HV=8"
:
{
"BV"
:
16
,
"num_warps"
:
1
}
}
}
aiter/ops/triton/configs/moe/E=160,N=320,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True.json
0 → 100644
View file @
bb596f6e
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
512
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
512
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
512
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
512
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"256"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
1
},
"8192"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
1
},
"16384"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
1
}
}
\ No newline at end of file
aiter/ops/triton/configs/moe/E=160,N=320,device_name=BW200B,dtype=fp8_w8a8.json
0 → 100644
View file @
bb596f6e
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
16
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"256"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"8192"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"16384"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
}
}
\ No newline at end of file
aiter/ops/triton/configs/moe/E=160,N=320,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json
0 → 100644
View file @
bb596f6e
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"256"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1024"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"8192"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
1
},
"16384"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
}
}
\ No newline at end of file
aiter/ops/triton/configs/moe/E=160,N=320,device_name=BW200B,dtype=int8_w8a8.json
0 → 100644
View file @
bb596f6e
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"256"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"8192"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
},
"16384"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
1
}
}
\ No newline at end of file
aiter/ops/triton/configs/moe/E=256,N=128,device_name=K100_AI,dtype=int8_w8a8,block_shape=[128,128].json
View file @
bb596f6e
...
...
@@ -3,110 +3,143 @@
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
16
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
4
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
8
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
1
"num_stages"
:
2
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_N"
:
16
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
4
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"num_warps"
:
8
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"local-prefetch"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_N"
:
16
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
4
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"instruction_sched_variant"
:
"none"
,
"num_warps"
:
8
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"local-prefetch"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
16
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
4
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"num_warps"
:
8
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
16
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
4
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"num_warps"
:
8
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
16
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"num_warps"
:
8
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
16
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"num_warps"
:
8
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"256"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
16
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"num_warps"
:
8
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"num_warps"
:
2
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"1024"
:
{
...
...
@@ -115,48 +148,76 @@
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"num_warps"
:
8
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"8192"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
4
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
true
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"num_warps"
:
4
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
},
"16384"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
6
4
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
25
6
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
4
,
"COMBINE_SCALE_LOAD"
:
true
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"num_warps"
:
4
,
"num_stages"
:
2
"sched_latency"
:
"none"
,
"kpack"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
1
},
"32768"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"COMBINE_SCALE_LOAD"
:
false
,
"USE_MLS_LOAD"
:
false
,
"instruction_sched_variant"
:
"none"
,
"sched_latency"
:
"mmac5-ds10"
,
"kpack"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
1
}
}
\ No newline at end of file
Prev
1
…
4
5
6
7
8
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment