Commit bb596f6e authored by xiaowei.zhang's avatar xiaowei.zhang
Browse files

1. Update MOE; 2. Update sglang mHC; 3. Update test scripts; 4 Add new

   ops.
parent d9ebb683
[submodule "3rdparty/composable_kernel"]
path = 3rdparty/composable_kernel
url = ../composable_kernel.git
branch = main
url = ../composable_kernel
branch = rel-5.7.1
[submodule "3rdparty/moe_c"]
path = 3rdparty/moe_c
url = ../moe.git
branch = master
url = ../Moe
branch = W8A8
Subproject commit 8d05eec5aa99d5fa0cc5f5ef372a2ce02036bb73
Subproject commit a3b6d4d4825e8cf1b29160b9aa5ff8dbea08c8ea
......@@ -71,6 +71,8 @@ from .ops.rope import *
from .ops.topk import *
# from .ops.mha import *
from .ops.gradlib import *
from .ops.mhc import *
from .ops.grouped_gemm import *
# from .ops.trans_ragged_layout import *
# from . import mla
from .utility import dtypes,fp4_utils
......@@ -141,6 +141,57 @@ gfx938,no_quant,torch.float16,32768,352,4096,129,9,0,0,asm,13001+23001,19058.723
gfx938,no_quant,torch.float16,40960,352,4096,129,9,0,0,asm,13001+23001,23722.1115
gfx938,no_quant,torch.float16,49152,352,4096,129,9,0,0,asm,13001+23001,28329.0767
gfx938,no_quant,torch.float16,65536,352,4096,129,9,0,0,asm,13001+23001,37562.0269
gfx938,no_quant,torch.bfloat16,1,192,2048,128,8,0,0,asm,10006+20000,36.1297
gfx938,no_quant,torch.bfloat16,2,192,2048,128,8,0,0,asm,10008+20000,47.5655
gfx938,no_quant,torch.bfloat16,4,192,2048,128,8,0,0,asm,10011+20000,70.0834
gfx938,no_quant,torch.bfloat16,8,192,2048,128,8,0,0,asm,10002+20000,107.3634
gfx938,no_quant,torch.bfloat16,16,192,2048,128,8,0,0,asm,10008+20000,153.1485
gfx938,no_quant,torch.bfloat16,32,192,2048,128,8,0,0,asm,10008+20000,194.9505
gfx938,no_quant,torch.bfloat16,64,192,2048,128,8,0,0,asm,10002+20000,227.6915
gfx938,no_quant,torch.bfloat16,128,192,2048,128,8,0,0,asm,10002+20000,217.1789
gfx938,no_quant,torch.bfloat16,256,192,2048,128,8,0,0,asm,11004+21001,243.1747
gfx938,no_quant,torch.bfloat16,512,192,2048,128,8,0,0,asm,12000+22001,297.8012
gfx938,no_quant,torch.bfloat16,1024,192,2048,128,8,0,0,asm,12000+22001,411.3837
gfx938,no_quant,torch.bfloat16,2048,192,2048,128,8,0,0,asm,13000+23001,604.4119
gfx938,no_quant,torch.bfloat16,4096,192,2048,128,8,0,0,asm,13001+23001,955.5858
gfx938,no_quant,torch.bfloat16,8192,192,2048,128,8,0,0,asm,13001+23001,1689.6308
gfx938,no_quant,torch.bfloat16,16384,192,2048,128,8,0,0,asm,13001+23001,3207.5057
gfx938,no_quant,torch.bfloat16,32768,192,2048,128,8,0,0,asm,13001+23001,6173.8209
gfx938,no_quant,torch.bfloat16,65536,192,2048,128,8,0,0,asm,13000+23001,12477.2685
gfx938,no_quant,torch.bfloat16,1,384,2048,128,8,0,0,asm,10008+20001,47.3044
gfx938,no_quant,torch.bfloat16,2,384,2048,128,8,0,0,asm,10011+20002,69.216
gfx938,no_quant,torch.bfloat16,4,384,2048,128,8,0,0,asm,10002+20000,113.4772
gfx938,no_quant,torch.bfloat16,8,384,2048,128,8,0,0,asm,10008+20000,191.1611
gfx938,no_quant,torch.bfloat16,16,384,2048,128,8,0,0,asm,10011+20000,286.5208
gfx938,no_quant,torch.bfloat16,32,384,2048,128,8,0,0,asm,10011+20000,357.0469
gfx938,no_quant,torch.bfloat16,64,384,2048,128,8,0,0,asm,10002+20000,415.0259
gfx938,no_quant,torch.bfloat16,128,384,2048,128,8,0,0,asm,10008+20000,412.5307
gfx938,no_quant,torch.bfloat16,256,384,2048,128,8,0,0,asm,11004+21001,450.3575
gfx938,no_quant,torch.bfloat16,512,384,2048,128,8,0,0,asm,12001+22001,491.7812
gfx938,no_quant,torch.bfloat16,1024,384,2048,128,8,0,0,asm,13001+23001,597.3638
gfx938,no_quant,torch.bfloat16,2048,384,2048,128,8,0,0,asm,13001+23001,767.0737
gfx938,no_quant,torch.bfloat16,4096,384,2048,128,8,0,0,asm,13001+23001,1276.6764
gfx938,no_quant,torch.bfloat16,8192,384,2048,128,8,0,0,asm,13001+23001,2246.1413
gfx938,no_quant,torch.bfloat16,16384,384,2048,128,8,0,0,asm,13001+23001,4450.9009
gfx938,no_quant,torch.bfloat16,32768,384,2048,128,8,0,0,asm,13001+23001,8575.6606
gfx938,no_quant,torch.bfloat16,65536,384,2048,128,8,0,0,asm,13001+23001,16846.3634
gfx938,no_quant,torch.bfloat16,1,768,2048,128,8,0,0,asm,10011+20000,68.8792
gfx938,no_quant,torch.bfloat16,2,768,2048,128,8,0,0,asm,10008+20000,118.9591
gfx938,no_quant,torch.bfloat16,4,768,2048,128,8,0,0,asm,10008+20001,210.1253
gfx938,no_quant,torch.bfloat16,8,768,2048,128,8,0,0,asm,10011+20001,367.1356
gfx938,no_quant,torch.bfloat16,16,768,2048,128,8,0,0,asm,10002+20001,572.9879
gfx938,no_quant,torch.bfloat16,32,768,2048,128,8,0,0,asm,10002+20000,714.7725
gfx938,no_quant,torch.bfloat16,64,768,2048,128,8,0,0,asm,10002+20000,817.7197
gfx938,no_quant,torch.bfloat16,128,768,2048,128,8,0,0,asm,10011+20000,813.9779
gfx938,no_quant,torch.bfloat16,256,768,2048,128,8,0,0,asm,11005+21001,849.9948
gfx938,no_quant,torch.bfloat16,512,768,2048,128,8,0,0,asm,12001+22001,906.7694
gfx938,no_quant,torch.bfloat16,1024,768,2048,128,8,0,0,asm,13001+23002,1021.0857
gfx938,no_quant,torch.bfloat16,2048,768,2048,128,8,0,0,asm,13001+23001,1381.4244
gfx938,no_quant,torch.bfloat16,4096,768,2048,128,8,0,0,asm,13001+23001,2296.0597
gfx938,no_quant,torch.bfloat16,8192,768,2048,128,8,0,0,asm,13001+23001,4099.5996
gfx938,no_quant,torch.bfloat16,16384,768,2048,128,8,0,0,asm,13001+23001,7791.3597
gfx938,no_quant,torch.bfloat16,32768,768,2048,128,8,0,0,asm,13001+23001,15124.7783
gfx938,no_quant,torch.bfloat16,65536,768,2048,128,8,0,0,asm,13001+23001,29786.7389
gfx936,no_quant,torch.float16,1,256,3072,256,8,0,0,asm,10002+20000,55.456
gfx936,no_quant,torch.float16,2,256,3072,256,8,0,0,asm,10002+20000,86.6223
gfx936,no_quant,torch.float16,4,256,3072,256,8,0,0,asm,10002+20000,155.6412
......@@ -241,3 +292,419 @@ gfx936,no_quant,torch.float16,12288,128,3072,256,8,0,0,asm,13001+23001,2854.5487
gfx936,no_quant,torch.float16,16384,128,3072,256,8,0,0,asm,13001+23001,3669.7898
gfx936,no_quant,torch.float16,24576,128,3072,256,8,0,0,asm,13001+23001,5322.7565
gfx936,no_quant,torch.float16,32768,128,3072,256,8,0,0,asm,13001+23001,7028.0263
gfx936,no_quant,torch.bfloat16,1,384,2048,128,8,0,0,asm,10009+20001,53.9486
gfx936,no_quant,torch.bfloat16,2,384,2048,128,8,0,0,asm,10001+20000,84.7023
gfx936,no_quant,torch.bfloat16,4,384,2048,128,8,0,0,asm,10001+20000,144.3738
gfx936,no_quant,torch.bfloat16,8,384,2048,128,8,0,0,asm,10001+20001,243.5566
gfx936,no_quant,torch.bfloat16,16,384,2048,128,8,0,0,asm,10001+20001,376.5333
gfx936,no_quant,torch.bfloat16,32,384,2048,128,8,0,0,asm,10001+20001,470.0152
gfx936,no_quant,torch.bfloat16,48,384,2048,128,8,0,0,asm,10001+20001,531.1322
gfx936,no_quant,torch.bfloat16,64,384,2048,128,8,0,0,asm,10001+20001,553.4004
gfx936,no_quant,torch.bfloat16,96,384,2048,128,8,0,0,asm,10001+20001,552.4121
gfx936,no_quant,torch.bfloat16,128,384,2048,128,8,0,0,asm,10001+20001,561.5489
gfx936,no_quant,torch.bfloat16,200,384,2048,128,8,0,0,asm,10001+20001,605.4647
gfx936,no_quant,torch.bfloat16,256,384,2048,128,8,0,0,asm,11000+20002,622.9636
gfx936,no_quant,torch.bfloat16,384,384,2048,128,8,0,0,asm,11006+20002,668.1256
gfx936,no_quant,torch.bfloat16,460,384,2048,128,8,0,0,asm,11007+20002,671.6456
gfx936,no_quant,torch.bfloat16,512,384,2048,128,8,0,0,asm,12004+22001,686.9045
gfx936,no_quant,torch.bfloat16,798,384,2048,128,8,0,0,asm,12004+22001,732.6055
gfx936,no_quant,torch.bfloat16,1024,384,2048,128,8,0,0,asm,13001+22001,769.1949
gfx936,no_quant,torch.bfloat16,1280,384,2048,128,8,0,0,asm,13001+22001,801.9444
gfx936,no_quant,torch.bfloat16,1440,384,2048,128,8,0,0,asm,13001+22001,841.9274
gfx936,no_quant,torch.bfloat16,1560,384,2048,128,8,0,0,asm,13001+22001,832.7822
gfx936,no_quant,torch.bfloat16,1880,384,2048,128,8,0,0,asm,13000+23002,849.4395
gfx936,no_quant,torch.bfloat16,2000,384,2048,128,8,0,0,asm,13001+22001,886.6015
gfx936,no_quant,torch.bfloat16,2200,384,2048,128,8,0,0,asm,13001+23001,923.3931
gfx936,no_quant,torch.bfloat16,2400,384,2048,128,8,0,0,asm,13001+22001,983.6457
gfx936,no_quant,torch.bfloat16,2800,384,2048,128,8,0,0,asm,13001+23001,1006.7361999999999
gfx936,no_quant,torch.bfloat16,3200,384,2048,128,8,0,0,asm,13001+23001,1057.7929
gfx936,no_quant,torch.bfloat16,3660,384,2048,128,8,0,0,asm,13001+23001,1118.3403
gfx936,no_quant,torch.bfloat16,4096,384,2048,128,8,0,0,asm,13001+23001,1237.801
gfx936,no_quant,torch.bfloat16,1,1024,4096,512,10,0,0,asm,10001+20000,266.4873
gfx936,no_quant,torch.bfloat16,4,1024,4096,512,10,0,0,asm,10000+20000,927.0086
gfx936,no_quant,torch.bfloat16,16,1024,4096,512,10,0,0,asm,13000+23002,3159.4357
gfx936,no_quant,torch.bfloat16,32,1024,4096,512,10,0,0,asm,13000+23002,5249.8556
gfx936,no_quant,torch.bfloat16,64,1024,4096,512,10,0,0,asm,13000+23002,7941.6251
gfx936,no_quant,torch.bfloat16,128,1024,4096,512,10,0,0,asm,13000+23002,10229.1505
gfx936,no_quant,torch.bfloat16,256,1024,4096,512,10,0,0,asm,13000+23002,11070.6568
gfx936,no_quant,torch.bfloat16,512,1024,4096,512,10,0,0,asm,13000+23002,11400.1216
gfx936,no_quant,torch.bfloat16,1024,1024,4096,512,10,0,0,asm,13000+23002,11766.9253
gfx936,no_quant,torch.bfloat16,2048,1024,4096,512,10,0,0,asm,13000+23002,12317.5013
gfx936,no_quant,torch.bfloat16,4096,1024,4096,512,10,0,0,asm,13000+23002,13405.458
gfx936,no_quant,torch.bfloat16,8192,1024,4096,512,10,0,0,asm,13001+23002,16444.1795
gfx936,no_quant,torch.bfloat16,16384,1024,4096,512,10,0,0,asm,13001+23002,25116.8953
gfx936,no_quant,torch.bfloat16,32768,1024,4096,512,10,0,0,asm,13001+23002,47344.0981
gfx936,no_quant,torch.bfloat16,1,192,4096,128,8,0,0,asm,10002+20000,49.5607
gfx936,no_quant,torch.bfloat16,2,192,4096,128,8,0,0,asm,10000+20000,86.7055
gfx936,no_quant,torch.bfloat16,3,192,4096,128,8,0,0,asm,10001+20000,119.6654
gfx936,no_quant,torch.bfloat16,4,192,4096,128,8,0,0,asm,10000+20000,145.4165
gfx936,no_quant,torch.bfloat16,5,192,4096,128,8,0,0,asm,10011+20000,167.3363
gfx936,no_quant,torch.bfloat16,6,192,4096,128,8,0,0,asm,10001+20000,198.0811
gfx936,no_quant,torch.bfloat16,7,192,4096,128,8,0,0,asm,10001+20000,225.6682
gfx936,no_quant,torch.bfloat16,8,192,4096,128,8,0,0,asm,10001+20000,253.22990000000001
gfx936,no_quant,torch.bfloat16,9,192,4096,128,8,0,0,asm,10001+20000,276.1772
gfx936,no_quant,torch.bfloat16,10,192,4096,128,8,0,0,asm,10001+20000,293.5413
gfx936,no_quant,torch.bfloat16,11,192,4096,128,8,0,0,asm,10005+20000,309.1789
gfx936,no_quant,torch.bfloat16,12,192,4096,128,8,0,0,asm,10001+20000,329.4145
gfx936,no_quant,torch.bfloat16,13,192,4096,128,8,0,0,asm,10001+20000,348.7321
gfx936,no_quant,torch.bfloat16,14,192,4096,128,8,0,0,asm,10001+20000,358.8541
gfx936,no_quant,torch.bfloat16,15,192,4096,128,8,0,0,asm,10001+20000,373.5655
gfx936,no_quant,torch.bfloat16,16,192,4096,128,8,0,0,asm,10001+20000,391.4516
gfx936,no_quant,torch.bfloat16,17,192,4096,128,8,0,0,asm,10001+20000,389.9161
gfx936,no_quant,torch.bfloat16,18,192,4096,128,8,0,0,asm,10001+20000,411.9958
gfx936,no_quant,torch.bfloat16,20,192,4096,128,8,0,0,asm,10000+20000,433.6464
gfx936,no_quant,torch.bfloat16,24,192,4096,128,8,0,0,asm,10000+20000,455.8606
gfx936,no_quant,torch.bfloat16,28,192,4096,128,8,0,0,asm,10000+20000,472.2339
gfx936,no_quant,torch.bfloat16,32,192,4096,128,8,0,0,asm,10000+20000,491.9726
gfx936,no_quant,torch.bfloat16,34,192,4096,128,8,0,0,asm,10001+20000,489.6287
gfx936,no_quant,torch.bfloat16,36,192,4096,128,8,0,0,asm,10000+20000,501.42629999999997
gfx936,no_quant,torch.bfloat16,40,192,4096,128,8,0,0,asm,10001+20000,519.8009
gfx936,no_quant,torch.bfloat16,44,192,4096,128,8,0,0,asm,10000+20000,541.9312
gfx936,no_quant,torch.bfloat16,48,192,4096,128,8,0,0,asm,10000+20000,553.4089
gfx936,no_quant,torch.bfloat16,56,192,4096,128,8,0,0,asm,10000+20000,556.8362
gfx936,no_quant,torch.bfloat16,64,192,4096,128,8,0,0,asm,10001+20000,573.7738
gfx936,no_quant,torch.bfloat16,68,192,4096,128,8,0,0,asm,10000+20000,563.0172
gfx936,no_quant,torch.bfloat16,72,192,4096,128,8,0,0,asm,10000+20000,566.1834
gfx936,no_quant,torch.bfloat16,80,192,4096,128,8,0,0,asm,10001+20000,566.7393
gfx936,no_quant,torch.bfloat16,88,192,4096,128,8,0,0,asm,10005+20000,603.7156
gfx936,no_quant,torch.bfloat16,96,192,4096,128,8,0,0,asm,10001+20000,577.3413
gfx936,no_quant,torch.bfloat16,104,192,4096,128,8,0,0,asm,10003+20000,584.0697
gfx936,no_quant,torch.bfloat16,112,192,4096,128,8,0,0,asm,10000+20000,585.4338
gfx936,no_quant,torch.bfloat16,128,192,4096,128,8,0,0,asm,10000+20000,590.5707
gfx936,no_quant,torch.bfloat16,144,192,4096,128,8,0,0,asm,10001+20000,598.9495
gfx936,no_quant,torch.bfloat16,160,192,4096,128,8,0,0,asm,10001+20000,603.9263
gfx936,no_quant,torch.bfloat16,192,192,4096,128,8,0,0,asm,10001+20000,626.6545
gfx936,no_quant,torch.bfloat16,224,192,4096,128,8,0,0,asm,10012+20000,626.0144
gfx936,no_quant,torch.bfloat16,256,192,4096,128,8,0,0,asm,10001+20000,640.6671
gfx936,no_quant,torch.bfloat16,320,192,4096,128,8,0,0,asm,11006+21001,662.0478
gfx936,no_quant,torch.bfloat16,384,192,4096,128,8,0,0,asm,12004+22001,676.5403
gfx936,no_quant,torch.bfloat16,448,192,4096,128,8,0,0,asm,12004+22001,688.4306
gfx936,no_quant,torch.bfloat16,512,192,4096,128,8,0,0,asm,12004+22001,708.1021
gfx936,no_quant,torch.bfloat16,576,192,4096,128,8,0,0,asm,12004+22001,720.3208
gfx936,no_quant,torch.bfloat16,640,192,4096,128,8,0,0,asm,12004+22001,736.7166
gfx936,no_quant,torch.bfloat16,704,192,4096,128,8,0,0,asm,12004+22001,743.7227
gfx936,no_quant,torch.bfloat16,768,192,4096,128,8,0,0,asm,12004+22001,763.0237
gfx936,no_quant,torch.bfloat16,832,192,4096,128,8,0,0,asm,12004+22001,779.8572
gfx936,no_quant,torch.bfloat16,896,192,4096,128,8,0,0,asm,12004+22001,797.6254
gfx936,no_quant,torch.bfloat16,960,192,4096,128,8,0,0,asm,12004+22001,836.2184
gfx936,no_quant,torch.bfloat16,1024,192,4096,128,8,0,0,asm,12001+22001,843.6542
gfx936,no_quant,torch.bfloat16,1152,192,4096,128,8,0,0,asm,13000+23001,884.7822
gfx936,no_quant,torch.bfloat16,1280,192,4096,128,8,0,0,asm,13000+23001,899.9314
gfx936,no_quant,torch.bfloat16,1408,192,4096,128,8,0,0,asm,13000+23001,922.5503
gfx936,no_quant,torch.bfloat16,1536,192,4096,128,8,0,0,asm,13000+23001,967.3328
gfx936,no_quant,torch.bfloat16,1664,192,4096,128,8,0,0,asm,13000+23001,974.6591
gfx936,no_quant,torch.bfloat16,1792,192,4096,128,8,0,0,asm,13001+23001,1055.4919
gfx936,no_quant,torch.bfloat16,1920,192,4096,128,8,0,0,asm,13000+23001,1065.8918
gfx936,no_quant,torch.bfloat16,2048,192,4096,128,8,0,0,asm,13000+23001,1143.0194
gfx936,no_quant,torch.bfloat16,2304,192,4096,128,8,0,0,asm,13001+23001,1326.3941
gfx936,no_quant,torch.bfloat16,2560,192,4096,128,8,0,0,asm,13001+23001,1377.5515
gfx936,no_quant,torch.bfloat16,2816,192,4096,128,8,0,0,asm,13000+23001,1432.0519
gfx936,no_quant,torch.bfloat16,3072,192,4096,128,8,0,0,asm,13001+23001,1466.1822
gfx936,no_quant,torch.bfloat16,3328,192,4096,128,8,0,0,asm,13001+23001,1510.1901
gfx936,no_quant,torch.bfloat16,3584,192,4096,128,8,0,0,asm,13001+23001,1557.4486
gfx936,no_quant,torch.bfloat16,3840,192,4096,128,8,0,0,asm,13001+23001,1618.8459
gfx936,no_quant,torch.bfloat16,4096,192,4096,128,8,0,0,asm,13001+23001,1753.8086
gfx936,no_quant,torch.bfloat16,4608,192,4096,128,8,0,0,asm,13001+23001,2039.8014
gfx936,no_quant,torch.bfloat16,5120,192,4096,128,8,0,0,asm,13001+23001,2126.6889
gfx936,no_quant,torch.bfloat16,5632,192,4096,128,8,0,0,asm,13001+23001,2223.7741
gfx936,no_quant,torch.bfloat16,6144,192,4096,128,8,0,0,asm,13001+23001,2430.6853
gfx936,no_quant,torch.bfloat16,6656,192,4096,128,8,0,0,asm,13001+23001,2734.7157
gfx936,no_quant,torch.bfloat16,7168,192,4096,128,8,0,0,asm,13001+23001,2803.3465
gfx936,no_quant,torch.bfloat16,7680,192,4096,128,8,0,0,asm,13001+23001,2972.8603
gfx936,no_quant,torch.bfloat16,8192,192,4096,128,8,0,0,asm,13001+23001,3185.2116
gfx936,no_quant,torch.bfloat16,10240,192,4096,128,8,0,0,asm,13001+23001,3868.2946
gfx936,no_quant,torch.bfloat16,12288,192,4096,128,8,0,0,asm,13001+23001,4562.308
gfx936,no_quant,torch.bfloat16,14336,192,4096,128,8,0,0,asm,13001+23001,5337.3815
gfx936,no_quant,torch.bfloat16,16384,192,4096,128,8,0,0,asm,13001+23001,6046.3087
gfx938,no_quant,torch.bfloat16,1,192,4096,128,8,0,0,asm,10002+20000,49.5607
gfx938,no_quant,torch.bfloat16,2,192,4096,128,8,0,0,asm,10000+20000,86.7055
gfx938,no_quant,torch.bfloat16,3,192,4096,128,8,0,0,asm,10001+20000,119.6654
gfx938,no_quant,torch.bfloat16,4,192,4096,128,8,0,0,asm,10000+20000,145.4165
gfx938,no_quant,torch.bfloat16,5,192,4096,128,8,0,0,asm,10011+20000,167.3363
gfx938,no_quant,torch.bfloat16,6,192,4096,128,8,0,0,asm,10001+20000,198.0811
gfx938,no_quant,torch.bfloat16,7,192,4096,128,8,0,0,asm,10001+20000,225.6682
gfx938,no_quant,torch.bfloat16,8,192,4096,128,8,0,0,asm,10001+20000,253.22990000000001
gfx938,no_quant,torch.bfloat16,9,192,4096,128,8,0,0,asm,10001+20000,276.1772
gfx938,no_quant,torch.bfloat16,10,192,4096,128,8,0,0,asm,10001+20000,293.5413
gfx938,no_quant,torch.bfloat16,11,192,4096,128,8,0,0,asm,10005+20000,309.1789
gfx938,no_quant,torch.bfloat16,12,192,4096,128,8,0,0,asm,10001+20000,329.4145
gfx938,no_quant,torch.bfloat16,13,192,4096,128,8,0,0,asm,10001+20000,348.7321
gfx938,no_quant,torch.bfloat16,14,192,4096,128,8,0,0,asm,10001+20000,358.8541
gfx938,no_quant,torch.bfloat16,15,192,4096,128,8,0,0,asm,10001+20000,373.5655
gfx938,no_quant,torch.bfloat16,16,192,4096,128,8,0,0,asm,10001+20000,391.4516
gfx938,no_quant,torch.bfloat16,17,192,4096,128,8,0,0,asm,10001+20000,389.9161
gfx938,no_quant,torch.bfloat16,18,192,4096,128,8,0,0,asm,10001+20000,411.9958
gfx938,no_quant,torch.bfloat16,20,192,4096,128,8,0,0,asm,10000+20000,433.6464
gfx938,no_quant,torch.bfloat16,24,192,4096,128,8,0,0,asm,10000+20000,455.8606
gfx938,no_quant,torch.bfloat16,28,192,4096,128,8,0,0,asm,10000+20000,472.2339
gfx938,no_quant,torch.bfloat16,32,192,4096,128,8,0,0,asm,10000+20000,491.9726
gfx938,no_quant,torch.bfloat16,34,192,4096,128,8,0,0,asm,10001+20000,489.6287
gfx938,no_quant,torch.bfloat16,36,192,4096,128,8,0,0,asm,10000+20000,501.42629999999997
gfx938,no_quant,torch.bfloat16,40,192,4096,128,8,0,0,asm,10001+20000,519.8009
gfx938,no_quant,torch.bfloat16,44,192,4096,128,8,0,0,asm,10000+20000,541.9312
gfx938,no_quant,torch.bfloat16,48,192,4096,128,8,0,0,asm,10000+20000,553.4089
gfx938,no_quant,torch.bfloat16,56,192,4096,128,8,0,0,asm,10000+20000,556.8362
gfx938,no_quant,torch.bfloat16,64,192,4096,128,8,0,0,asm,10001+20000,573.7738
gfx938,no_quant,torch.bfloat16,68,192,4096,128,8,0,0,asm,10000+20000,563.0172
gfx938,no_quant,torch.bfloat16,72,192,4096,128,8,0,0,asm,10000+20000,566.1834
gfx938,no_quant,torch.bfloat16,80,192,4096,128,8,0,0,asm,10001+20000,566.7393
gfx938,no_quant,torch.bfloat16,88,192,4096,128,8,0,0,asm,10005+20000,603.7156
gfx938,no_quant,torch.bfloat16,96,192,4096,128,8,0,0,asm,10001+20000,577.3413
gfx938,no_quant,torch.bfloat16,104,192,4096,128,8,0,0,asm,10003+20000,584.0697
gfx938,no_quant,torch.bfloat16,112,192,4096,128,8,0,0,asm,10000+20000,585.4338
gfx938,no_quant,torch.bfloat16,128,192,4096,128,8,0,0,asm,10000+20000,590.5707
gfx938,no_quant,torch.bfloat16,144,192,4096,128,8,0,0,asm,10001+20000,598.9495
gfx938,no_quant,torch.bfloat16,160,192,4096,128,8,0,0,asm,10001+20000,603.9263
gfx938,no_quant,torch.bfloat16,192,192,4096,128,8,0,0,asm,10001+20000,626.6545
gfx938,no_quant,torch.bfloat16,224,192,4096,128,8,0,0,asm,10012+20000,626.0144
gfx938,no_quant,torch.bfloat16,256,192,4096,128,8,0,0,asm,10001+20000,640.6671
gfx938,no_quant,torch.bfloat16,320,192,4096,128,8,0,0,asm,11006+21001,662.0478
gfx938,no_quant,torch.bfloat16,384,192,4096,128,8,0,0,asm,12004+22001,676.5403
gfx938,no_quant,torch.bfloat16,448,192,4096,128,8,0,0,asm,12004+22001,688.4306
gfx938,no_quant,torch.bfloat16,512,192,4096,128,8,0,0,asm,12004+22001,708.1021
gfx938,no_quant,torch.bfloat16,576,192,4096,128,8,0,0,asm,12004+22001,720.3208
gfx938,no_quant,torch.bfloat16,640,192,4096,128,8,0,0,asm,12004+22001,736.7166
gfx938,no_quant,torch.bfloat16,704,192,4096,128,8,0,0,asm,12004+22001,743.7227
gfx938,no_quant,torch.bfloat16,768,192,4096,128,8,0,0,asm,12004+22001,763.0237
gfx938,no_quant,torch.bfloat16,832,192,4096,128,8,0,0,asm,12004+22001,779.8572
gfx938,no_quant,torch.bfloat16,896,192,4096,128,8,0,0,asm,12004+22001,797.6254
gfx938,no_quant,torch.bfloat16,960,192,4096,128,8,0,0,asm,12004+22001,836.2184
gfx938,no_quant,torch.bfloat16,1024,192,4096,128,8,0,0,asm,12001+22001,843.6542
gfx938,no_quant,torch.bfloat16,1152,192,4096,128,8,0,0,asm,13000+23001,884.7822
gfx938,no_quant,torch.bfloat16,1280,192,4096,128,8,0,0,asm,13000+23001,899.9314
gfx938,no_quant,torch.bfloat16,1408,192,4096,128,8,0,0,asm,13000+23001,922.5503
gfx938,no_quant,torch.bfloat16,1536,192,4096,128,8,0,0,asm,13000+23001,967.3328
gfx938,no_quant,torch.bfloat16,1664,192,4096,128,8,0,0,asm,13000+23001,974.6591
gfx938,no_quant,torch.bfloat16,1792,192,4096,128,8,0,0,asm,13001+23001,1055.4919
gfx938,no_quant,torch.bfloat16,1920,192,4096,128,8,0,0,asm,13000+23001,1065.8918
gfx938,no_quant,torch.bfloat16,2048,192,4096,128,8,0,0,asm,13000+23001,1143.0194
gfx938,no_quant,torch.bfloat16,2304,192,4096,128,8,0,0,asm,13001+23001,1326.3941
gfx938,no_quant,torch.bfloat16,2560,192,4096,128,8,0,0,asm,13001+23001,1377.5515
gfx938,no_quant,torch.bfloat16,2816,192,4096,128,8,0,0,asm,13000+23001,1432.0519
gfx938,no_quant,torch.bfloat16,3072,192,4096,128,8,0,0,asm,13001+23001,1466.1822
gfx938,no_quant,torch.bfloat16,3328,192,4096,128,8,0,0,asm,13001+23001,1510.1901
gfx938,no_quant,torch.bfloat16,3584,192,4096,128,8,0,0,asm,13001+23001,1557.4486
gfx938,no_quant,torch.bfloat16,3840,192,4096,128,8,0,0,asm,13001+23001,1618.8459
gfx938,no_quant,torch.bfloat16,4096,192,4096,128,8,0,0,asm,13001+23001,1753.8086
gfx938,no_quant,torch.bfloat16,4608,192,4096,128,8,0,0,asm,13001+23001,2039.8014
gfx938,no_quant,torch.bfloat16,5120,192,4096,128,8,0,0,asm,13001+23001,2126.6889
gfx938,no_quant,torch.bfloat16,5632,192,4096,128,8,0,0,asm,13001+23001,2223.7741
gfx938,no_quant,torch.bfloat16,6144,192,4096,128,8,0,0,asm,13001+23001,2430.6853
gfx938,no_quant,torch.bfloat16,6656,192,4096,128,8,0,0,asm,13001+23001,2734.7157
gfx938,no_quant,torch.bfloat16,7168,192,4096,128,8,0,0,asm,13001+23001,2803.3465
gfx938,no_quant,torch.bfloat16,7680,192,4096,128,8,0,0,asm,13001+23001,2972.8603
gfx938,no_quant,torch.bfloat16,8192,192,4096,128,8,0,0,asm,13001+23001,3185.2116
gfx938,no_quant,torch.bfloat16,10240,192,4096,128,8,0,0,asm,13001+23001,3868.2946
gfx938,no_quant,torch.bfloat16,12288,192,4096,128,8,0,0,asm,13001+23001,4562.308
gfx938,no_quant,torch.bfloat16,14336,192,4096,128,8,0,0,asm,13001+23001,5337.3815
gfx938,no_quant,torch.bfloat16,16384,192,4096,128,8,0,0,asm,13001+23001,6046.3087
gfx938,no_quant,torch.bfloat16,1,384,4096,128,8,0,0,asm,10008+20000,88.778
gfx938,no_quant,torch.bfloat16,2,384,4096,128,8,0,0,asm,10012+20000,145.6453
gfx938,no_quant,torch.bfloat16,3,384,4096,128,8,0,0,asm,10002+20000,226.1335
gfx938,no_quant,torch.bfloat16,4,384,4096,128,8,0,0,asm,10008+20000,262.7649
gfx938,no_quant,torch.bfloat16,5,384,4096,128,8,0,0,asm,10011+20000,309.5268
gfx938,no_quant,torch.bfloat16,6,384,4096,128,8,0,0,asm,10002+20000,379.8004
gfx938,no_quant,torch.bfloat16,7,384,4096,128,8,0,0,asm,10008+20000,421.0213
gfx938,no_quant,torch.bfloat16,8,384,4096,128,8,0,0,asm,10011+20000,473.8716
gfx938,no_quant,torch.bfloat16,9,384,4096,128,8,0,0,asm,10011+20000,501.8547
gfx938,no_quant,torch.bfloat16,10,384,4096,128,8,0,0,asm,10002+20000,554.2504
gfx938,no_quant,torch.bfloat16,11,384,4096,128,8,0,0,asm,10002+20000,569.8967
gfx938,no_quant,torch.bfloat16,12,384,4096,128,8,0,0,asm,10008+20000,607.4544
gfx938,no_quant,torch.bfloat16,13,384,4096,128,8,0,0,asm,10002+20000,647.4375
gfx938,no_quant,torch.bfloat16,14,384,4096,128,8,0,0,asm,10012+20000,668.9448
gfx938,no_quant,torch.bfloat16,15,384,4096,128,8,0,0,asm,10002+20000,691.8079
gfx938,no_quant,torch.bfloat16,16,384,4096,128,8,0,0,asm,10002+20000,721.2395
gfx938,no_quant,torch.bfloat16,17,384,4096,128,8,0,0,asm,10002+20000,722.8702
gfx938,no_quant,torch.bfloat16,18,384,4096,128,8,0,0,asm,10002+20000,759.9647
gfx938,no_quant,torch.bfloat16,20,384,4096,128,8,0,0,asm,10002+20000,807.611
gfx938,no_quant,torch.bfloat16,24,384,4096,128,8,0,0,asm,10002+20000,838.7015
gfx938,no_quant,torch.bfloat16,28,384,4096,128,8,0,0,asm,10002+20000,864.4391
gfx938,no_quant,torch.bfloat16,32,384,4096,128,8,0,0,asm,10002+20000,897.0201
gfx938,no_quant,torch.bfloat16,34,384,4096,128,8,0,0,asm,10002+20000,897.0423
gfx938,no_quant,torch.bfloat16,36,384,4096,128,8,0,0,asm,10002+20000,925.1771
gfx938,no_quant,torch.bfloat16,40,384,4096,128,8,0,0,asm,10002+20000,946.8443
gfx938,no_quant,torch.bfloat16,44,384,4096,128,8,0,0,asm,10002+20000,982.0526
gfx938,no_quant,torch.bfloat16,48,384,4096,128,8,0,0,asm,10002+20000,1004.0483
gfx938,no_quant,torch.bfloat16,56,384,4096,128,8,0,0,asm,10002+20000,1008.3682999999999
gfx938,no_quant,torch.bfloat16,64,384,4096,128,8,0,0,asm,10002+20000,1036.6745
gfx938,no_quant,torch.bfloat16,68,384,4096,128,8,0,0,asm,10002+20000,1009.5556000000001
gfx938,no_quant,torch.bfloat16,72,384,4096,128,8,0,0,asm,10002+20000,1017.5387000000001
gfx938,no_quant,torch.bfloat16,80,384,4096,128,8,0,0,asm,10002+20000,1016.1324000000001
gfx938,no_quant,torch.bfloat16,88,384,4096,128,8,0,0,asm,10002+20000,1022.3051
gfx938,no_quant,torch.bfloat16,96,384,4096,128,8,0,0,asm,10002+20000,1028.1575
gfx938,no_quant,torch.bfloat16,104,384,4096,128,8,0,0,asm,10002+20000,1029.3365
gfx938,no_quant,torch.bfloat16,112,384,4096,128,8,0,0,asm,10002+20000,1030.6838
gfx938,no_quant,torch.bfloat16,128,384,4096,128,8,0,0,asm,10002+20000,1038.3891
gfx938,no_quant,torch.bfloat16,144,384,4096,128,8,0,0,asm,10002+20000,1049.4039
gfx938,no_quant,torch.bfloat16,160,384,4096,128,8,0,0,asm,10002+20000,1057.5723
gfx938,no_quant,torch.bfloat16,192,384,4096,128,8,0,0,asm,12001+22001,1096.2332
gfx938,no_quant,torch.bfloat16,224,384,4096,128,8,0,0,asm,12001+22001,1105.1679
gfx938,no_quant,torch.bfloat16,256,384,4096,128,8,0,0,asm,12001+22001,1114.4311
gfx938,no_quant,torch.bfloat16,320,384,4096,128,8,0,0,asm,12001+22001,1129.5048
gfx938,no_quant,torch.bfloat16,384,384,4096,128,8,0,0,asm,12001+22001,1159.4921
gfx938,no_quant,torch.bfloat16,448,384,4096,128,8,0,0,asm,12001+22001,1161.1932
gfx938,no_quant,torch.bfloat16,512,384,4096,128,8,0,0,asm,12001+22001,1180.1742
gfx938,no_quant,torch.bfloat16,576,384,4096,128,8,0,0,asm,12001+22001,1196.2163
gfx938,no_quant,torch.bfloat16,640,384,4096,128,8,0,0,asm,12001+22001,1219.7446
gfx938,no_quant,torch.bfloat16,704,384,4096,128,8,0,0,asm,12001+22001,1224.0478
gfx938,no_quant,torch.bfloat16,768,384,4096,128,8,0,0,asm,12001+22001,1238.1277
gfx938,no_quant,torch.bfloat16,832,384,4096,128,8,0,0,asm,12001+22001,1252.9572
gfx938,no_quant,torch.bfloat16,896,384,4096,128,8,0,0,asm,12001+22001,1309.0665
gfx938,no_quant,torch.bfloat16,960,384,4096,128,8,0,0,asm,13001+23001,1374.0515
gfx938,no_quant,torch.bfloat16,1024,384,4096,128,8,0,0,asm,13001+23001,1385.7821
gfx938,no_quant,torch.bfloat16,1152,384,4096,128,8,0,0,asm,13001+23001,1423.6346
gfx938,no_quant,torch.bfloat16,1280,384,4096,128,8,0,0,asm,13001+23001,1419.9631
gfx938,no_quant,torch.bfloat16,1408,384,4096,128,8,0,0,asm,13001+23001,1439.9123
gfx938,no_quant,torch.bfloat16,1536,384,4096,128,8,0,0,asm,13001+23001,1467.6091
gfx938,no_quant,torch.bfloat16,1664,384,4096,128,8,0,0,asm,13001+23001,1592.1729
gfx938,no_quant,torch.bfloat16,1792,384,4096,128,8,0,0,asm,13001+23001,1631.3979
gfx938,no_quant,torch.bfloat16,1920,384,4096,128,8,0,0,asm,13001+23001,1820.3655
gfx938,no_quant,torch.bfloat16,2048,384,4096,128,8,0,0,asm,13001+23001,2047.0343
gfx938,no_quant,torch.bfloat16,2304,384,4096,128,8,0,0,asm,13001+23001,2625.8116
gfx938,no_quant,torch.bfloat16,2560,384,4096,128,8,0,0,asm,13001+23001,2679.9506
gfx938,no_quant,torch.bfloat16,2816,384,4096,128,8,0,0,asm,13001+23001,2715.7403
gfx938,no_quant,torch.bfloat16,3072,384,4096,128,8,0,0,asm,13001+23001,2745.2056
gfx938,no_quant,torch.bfloat16,3328,384,4096,128,8,0,0,asm,13001+23001,2781.8456
gfx938,no_quant,torch.bfloat16,3584,384,4096,128,8,0,0,asm,13001+23001,2925.4747
gfx938,no_quant,torch.bfloat16,3840,384,4096,128,8,0,0,asm,13001+23001,3025.5921
gfx938,no_quant,torch.bfloat16,4096,384,4096,128,8,0,0,asm,13001+23001,3433.3634
gfx938,no_quant,torch.bfloat16,4608,384,4096,128,8,0,0,asm,13001+23001,4062.0351
gfx938,no_quant,torch.bfloat16,5120,384,4096,128,8,0,0,asm,13001+23001,4116.7715
gfx938,no_quant,torch.bfloat16,5632,384,4096,128,8,0,0,asm,13001+23001,4327.2296
gfx938,no_quant,torch.bfloat16,6144,384,4096,128,8,0,0,asm,13001+23001,4917.9061
gfx938,no_quant,torch.bfloat16,6656,384,4096,128,8,0,0,asm,13001+23001,5421.0124
gfx938,no_quant,torch.bfloat16,7168,384,4096,128,8,0,0,asm,13001+23001,5534.0483
gfx938,no_quant,torch.bfloat16,7680,384,4096,128,8,0,0,asm,13001+23001,5698.4187
gfx938,no_quant,torch.bfloat16,8192,384,4096,128,8,0,0,asm,13001+23001,6196.8255
gfx938,no_quant,torch.bfloat16,10240,384,4096,128,8,0,0,asm,13001+23001,7595.145
gfx938,no_quant,torch.bfloat16,12288,384,4096,128,8,0,0,asm,13001+23001,8934.1889
gfx938,no_quant,torch.bfloat16,14336,384,4096,128,8,0,0,asm,13001+23001,10329.2311
gfx938,no_quant,torch.bfloat16,16384,384,4096,128,8,0,0,asm,13001+23001,11702.0355
gfx938,no_quant,torch.bfloat16,17408,384,4096,128,8,0,0,asm,13001+23001,12483.0714
gfx938,no_quant,torch.bfloat16,24576,384,4096,128,8,0,0,asm,13001+23001,17223.7804
gfx938,no_quant,torch.bfloat16,32768,384,4096,128,8,0,0,asm,13001+23001,22719.4254
gfx938,no_quant,torch.bfloat16,40960,384,4096,128,8,0,0,asm,13001+23001,28232.8175
gfx938,no_quant,torch.bfloat16,49152,384,4096,128,8,0,0,asm,13001+23001,33760.5851
gfx938,no_quant,torch.bfloat16,57344,384,4096,128,8,0,0,asm,13001+23001,39304.2245
gfx938,no_quant,torch.bfloat16,65536,384,4096,128,8,0,0,asm,13001+23001,44786.0225
gfx938,no_quant,torch.bfloat16,65536,384,4096,128,8,0,0,asm,13001+23001,44776.8645
gfx936,no_quant,torch.bfloat16,1,384,4096,128,8,0,0,asm,10008+20000,88.778
gfx936,no_quant,torch.bfloat16,2,384,4096,128,8,0,0,asm,10012+20000,145.6453
gfx936,no_quant,torch.bfloat16,3,384,4096,128,8,0,0,asm,10002+20000,226.1335
gfx936,no_quant,torch.bfloat16,4,384,4096,128,8,0,0,asm,10008+20000,262.7649
gfx936,no_quant,torch.bfloat16,5,384,4096,128,8,0,0,asm,10011+20000,309.5268
gfx936,no_quant,torch.bfloat16,6,384,4096,128,8,0,0,asm,10002+20000,379.8004
gfx936,no_quant,torch.bfloat16,7,384,4096,128,8,0,0,asm,10008+20000,421.0213
gfx936,no_quant,torch.bfloat16,8,384,4096,128,8,0,0,asm,10011+20000,473.8716
gfx936,no_quant,torch.bfloat16,9,384,4096,128,8,0,0,asm,10011+20000,501.8547
gfx936,no_quant,torch.bfloat16,10,384,4096,128,8,0,0,asm,10002+20000,554.2504
gfx936,no_quant,torch.bfloat16,11,384,4096,128,8,0,0,asm,10002+20000,569.8967
gfx936,no_quant,torch.bfloat16,12,384,4096,128,8,0,0,asm,10008+20000,607.4544
gfx936,no_quant,torch.bfloat16,13,384,4096,128,8,0,0,asm,10002+20000,647.4375
gfx936,no_quant,torch.bfloat16,14,384,4096,128,8,0,0,asm,10012+20000,668.9448
gfx936,no_quant,torch.bfloat16,15,384,4096,128,8,0,0,asm,10002+20000,691.8079
gfx936,no_quant,torch.bfloat16,16,384,4096,128,8,0,0,asm,10002+20000,721.2395
gfx936,no_quant,torch.bfloat16,17,384,4096,128,8,0,0,asm,10002+20000,722.8702
gfx936,no_quant,torch.bfloat16,18,384,4096,128,8,0,0,asm,10002+20000,759.9647
gfx936,no_quant,torch.bfloat16,20,384,4096,128,8,0,0,asm,10002+20000,807.611
gfx936,no_quant,torch.bfloat16,24,384,4096,128,8,0,0,asm,10002+20000,838.7015
gfx936,no_quant,torch.bfloat16,28,384,4096,128,8,0,0,asm,10002+20000,864.4391
gfx936,no_quant,torch.bfloat16,32,384,4096,128,8,0,0,asm,10002+20000,897.0201
gfx936,no_quant,torch.bfloat16,34,384,4096,128,8,0,0,asm,10002+20000,897.0423
gfx936,no_quant,torch.bfloat16,36,384,4096,128,8,0,0,asm,10002+20000,925.1771
gfx936,no_quant,torch.bfloat16,40,384,4096,128,8,0,0,asm,10002+20000,946.8443
gfx936,no_quant,torch.bfloat16,44,384,4096,128,8,0,0,asm,10002+20000,982.0526
gfx936,no_quant,torch.bfloat16,48,384,4096,128,8,0,0,asm,10002+20000,1004.0483
gfx936,no_quant,torch.bfloat16,56,384,4096,128,8,0,0,asm,10002+20000,1008.3682999999999
gfx936,no_quant,torch.bfloat16,64,384,4096,128,8,0,0,asm,10002+20000,1036.6745
gfx936,no_quant,torch.bfloat16,68,384,4096,128,8,0,0,asm,10002+20000,1009.5556000000001
gfx936,no_quant,torch.bfloat16,72,384,4096,128,8,0,0,asm,10002+20000,1017.5387000000001
gfx936,no_quant,torch.bfloat16,80,384,4096,128,8,0,0,asm,10002+20000,1016.1324000000001
gfx936,no_quant,torch.bfloat16,88,384,4096,128,8,0,0,asm,10002+20000,1022.3051
gfx936,no_quant,torch.bfloat16,96,384,4096,128,8,0,0,asm,10002+20000,1028.1575
gfx936,no_quant,torch.bfloat16,104,384,4096,128,8,0,0,asm,10002+20000,1029.3365
gfx936,no_quant,torch.bfloat16,112,384,4096,128,8,0,0,asm,10002+20000,1030.6838
gfx936,no_quant,torch.bfloat16,128,384,4096,128,8,0,0,asm,10002+20000,1038.3891
gfx936,no_quant,torch.bfloat16,144,384,4096,128,8,0,0,asm,10002+20000,1049.4039
gfx936,no_quant,torch.bfloat16,160,384,4096,128,8,0,0,asm,10002+20000,1057.5723
gfx936,no_quant,torch.bfloat16,192,384,4096,128,8,0,0,asm,12001+22001,1096.2332
gfx936,no_quant,torch.bfloat16,224,384,4096,128,8,0,0,asm,12001+22001,1105.1679
gfx936,no_quant,torch.bfloat16,256,384,4096,128,8,0,0,asm,12001+22001,1114.4311
gfx936,no_quant,torch.bfloat16,320,384,4096,128,8,0,0,asm,12001+22001,1129.5048
gfx936,no_quant,torch.bfloat16,384,384,4096,128,8,0,0,asm,12001+22001,1159.4921
gfx936,no_quant,torch.bfloat16,448,384,4096,128,8,0,0,asm,12001+22001,1161.1932
gfx936,no_quant,torch.bfloat16,512,384,4096,128,8,0,0,asm,12001+22001,1180.1742
gfx936,no_quant,torch.bfloat16,576,384,4096,128,8,0,0,asm,12001+22001,1196.2163
gfx936,no_quant,torch.bfloat16,640,384,4096,128,8,0,0,asm,12001+22001,1219.7446
gfx936,no_quant,torch.bfloat16,704,384,4096,128,8,0,0,asm,12001+22001,1224.0478
gfx936,no_quant,torch.bfloat16,768,384,4096,128,8,0,0,asm,12001+22001,1238.1277
gfx936,no_quant,torch.bfloat16,832,384,4096,128,8,0,0,asm,12001+22001,1252.9572
gfx936,no_quant,torch.bfloat16,896,384,4096,128,8,0,0,asm,12001+22001,1309.0665
gfx936,no_quant,torch.bfloat16,960,384,4096,128,8,0,0,asm,13001+23001,1374.0515
gfx936,no_quant,torch.bfloat16,1024,384,4096,128,8,0,0,asm,13001+23001,1385.7821
gfx936,no_quant,torch.bfloat16,1152,384,4096,128,8,0,0,asm,13001+23001,1423.6346
gfx936,no_quant,torch.bfloat16,1280,384,4096,128,8,0,0,asm,13001+23001,1419.9631
gfx936,no_quant,torch.bfloat16,1408,384,4096,128,8,0,0,asm,13001+23001,1439.9123
gfx936,no_quant,torch.bfloat16,1536,384,4096,128,8,0,0,asm,13001+23001,1467.6091
gfx936,no_quant,torch.bfloat16,1664,384,4096,128,8,0,0,asm,13001+23001,1592.1729
gfx936,no_quant,torch.bfloat16,1792,384,4096,128,8,0,0,asm,13001+23001,1631.3979
gfx936,no_quant,torch.bfloat16,1920,384,4096,128,8,0,0,asm,13001+23001,1820.3655
gfx936,no_quant,torch.bfloat16,2048,384,4096,128,8,0,0,asm,13001+23001,2047.0343
gfx936,no_quant,torch.bfloat16,2304,384,4096,128,8,0,0,asm,13001+23001,2625.8116
gfx936,no_quant,torch.bfloat16,2560,384,4096,128,8,0,0,asm,13001+23001,2679.9506
gfx936,no_quant,torch.bfloat16,2816,384,4096,128,8,0,0,asm,13001+23001,2715.7403
gfx936,no_quant,torch.bfloat16,3072,384,4096,128,8,0,0,asm,13001+23001,2745.2056
gfx936,no_quant,torch.bfloat16,3328,384,4096,128,8,0,0,asm,13001+23001,2781.8456
gfx936,no_quant,torch.bfloat16,3584,384,4096,128,8,0,0,asm,13001+23001,2925.4747
gfx936,no_quant,torch.bfloat16,3840,384,4096,128,8,0,0,asm,13001+23001,3025.5921
gfx936,no_quant,torch.bfloat16,4096,384,4096,128,8,0,0,asm,13001+23001,3433.3634
gfx936,no_quant,torch.bfloat16,4608,384,4096,128,8,0,0,asm,13001+23001,4062.0351
gfx936,no_quant,torch.bfloat16,5120,384,4096,128,8,0,0,asm,13001+23001,4116.7715
gfx936,no_quant,torch.bfloat16,5632,384,4096,128,8,0,0,asm,13001+23001,4327.2296
gfx936,no_quant,torch.bfloat16,6144,384,4096,128,8,0,0,asm,13001+23001,4917.9061
gfx936,no_quant,torch.bfloat16,6656,384,4096,128,8,0,0,asm,13001+23001,5421.0124
gfx936,no_quant,torch.bfloat16,7168,384,4096,128,8,0,0,asm,13001+23001,5534.0483
gfx936,no_quant,torch.bfloat16,7680,384,4096,128,8,0,0,asm,13001+23001,5698.4187
gfx936,no_quant,torch.bfloat16,8192,384,4096,128,8,0,0,asm,13001+23001,6196.8255
gfx936,no_quant,torch.bfloat16,10240,384,4096,128,8,0,0,asm,13001+23001,7595.145
gfx936,no_quant,torch.bfloat16,12288,384,4096,128,8,0,0,asm,13001+23001,8934.1889
gfx936,no_quant,torch.bfloat16,14336,384,4096,128,8,0,0,asm,13001+23001,10329.2311
gfx936,no_quant,torch.bfloat16,16384,384,4096,128,8,0,0,asm,13001+23001,11702.0355
gfx936,no_quant,torch.bfloat16,17408,384,4096,128,8,0,0,asm,13001+23001,12483.0714
gfx936,no_quant,torch.bfloat16,24576,384,4096,128,8,0,0,asm,13001+23001,17223.7804
gfx936,no_quant,torch.bfloat16,32768,384,4096,128,8,0,0,asm,13001+23001,22719.4254
gfx936,no_quant,torch.bfloat16,40960,384,4096,128,8,0,0,asm,13001+23001,28232.8175
gfx936,no_quant,torch.bfloat16,49152,384,4096,128,8,0,0,asm,13001+23001,33760.5851
gfx936,no_quant,torch.bfloat16,57344,384,4096,128,8,0,0,asm,13001+23001,39304.2245
gfx936,no_quant,torch.bfloat16,65536,384,4096,128,8,0,0,asm,13001+23001,44786.0225
gfx936,no_quant,torch.bfloat16,65536,384,4096,128,8,0,0,asm,13001+23001,44776.8645
gfx936,no_quant,torch.bfloat16,1,896,1280,64,6,0,0,asm,10011+20000,53.2828
gfx936,no_quant,torch.bfloat16,8,896,1280,64,6,0,0,asm,10002+20000,229.8033
gfx936,no_quant,torch.bfloat16,16,896,1280,64,6,0,0,asm,10002+20001,343.8064
gfx936,no_quant,torch.bfloat16,24,896,1280,64,6,0,0,asm,10002+20001,374.6831
gfx936,no_quant,torch.bfloat16,32,896,1280,64,6,0,0,asm,10002+20001,391.7386
gfx936,no_quant,torch.bfloat16,48,896,1280,64,6,0,0,asm,10002+20001,410.7332
gfx936,no_quant,torch.bfloat16,64,896,1280,64,6,0,0,asm,10002+20001,423.3508
gfx936,no_quant,torch.bfloat16,96,896,1280,64,6,0,0,asm,10007+20001,432.4509
gfx936,no_quant,torch.bfloat16,128,896,1280,64,6,0,0,asm,10002+20001,458.4495
gfx936,no_quant,torch.bfloat16,256,896,1280,64,6,0,0,asm,11006+20000,479.9704
gfx936,no_quant,torch.bfloat16,512,896,1280,64,6,0,0,asm,13001+23002,520.5426
gfx936,no_quant,torch.bfloat16,1024,896,1280,64,6,0,0,asm,13001+23002,572.3737
gfx936,no_quant,torch.bfloat16,2048,896,1280,64,6,0,0,asm,13001+23001,789.1126
gfx936,no_quant,torch.bfloat16,4096,896,1280,64,6,0,0,asm,13001+23001,1279.3058
gfx936,no_quant,torch.bfloat16,8192,896,1280,64,6,0,0,asm,13001+23001,2321.6618
gfx936,no_quant,torch.bfloat16,1,448,1280,64,6,0,0,asm,10009+20000,36.8876
gfx936,no_quant,torch.bfloat16,8,448,1280,64,6,0,0,asm,10010+20000,123.3791
gfx936,no_quant,torch.bfloat16,16,448,1280,64,6,0,0,asm,10002+20001,181.5932
gfx936,no_quant,torch.bfloat16,24,448,1280,64,6,0,0,asm,10002+20002,201.2532
gfx936,no_quant,torch.bfloat16,32,448,1280,64,6,0,0,asm,10002+20001,206.5698
gfx936,no_quant,torch.bfloat16,48,448,1280,64,6,0,0,asm,10002+20001,219.3329
gfx936,no_quant,torch.bfloat16,64,448,1280,64,6,0,0,asm,10002+20001,226.2244
gfx936,no_quant,torch.bfloat16,96,448,1280,64,6,0,0,asm,10002+20001,228.6381
gfx936,no_quant,torch.bfloat16,128,448,1280,64,6,0,0,asm,10002+20000,250.29160000000002
gfx936,no_quant,torch.bfloat16,256,448,1280,64,6,0,0,asm,11006+20000,257.0169
gfx936,no_quant,torch.bfloat16,512,448,1280,64,6,0,0,asm,12004+21001,293.6818
gfx936,no_quant,torch.bfloat16,1024,448,1280,64,6,0,0,asm,13000+22001,347.8455
gfx936,no_quant,torch.bfloat16,2048,448,1280,64,6,0,0,asm,13001+23001,530.3532
gfx936,no_quant,torch.bfloat16,4096,448,1280,64,6,0,0,asm,13001+23001,832.2113
gfx936,no_quant,torch.bfloat16,8192,448,1280,64,6,0,0,asm,13001+23001,1487.3629
......@@ -121,6 +121,57 @@ gfx938,no_quant,torch.float16,32768,352,4096,129,9,0,0,asm,13001+23001,16397.301
gfx938,no_quant,torch.float16,40960,352,4096,129,9,0,0,asm,13001+23001,20398.5288
gfx938,no_quant,torch.float16,49152,352,4096,129,9,0,0,asm,13001+23001,24396.6972
gfx938,no_quant,torch.float16,65536,352,4096,129,9,0,0,asm,13001+23001,32435.0655
gfx938,no_quant,torch.bfloat16,1,192,2048,128,8,0,0,asm,10007+20000,39.2876
gfx938,no_quant,torch.bfloat16,2,192,2048,128,8,0,0,asm,10009+20000,51.8687
gfx938,no_quant,torch.bfloat16,4,192,2048,128,8,0,0,asm,10006+20000,73.8475
gfx938,no_quant,torch.bfloat16,8,192,2048,128,8,0,0,asm,10006+20000,107.3038
gfx938,no_quant,torch.bfloat16,16,192,2048,128,8,0,0,asm,10007+20000,149.3254
gfx938,no_quant,torch.bfloat16,32,192,2048,128,8,0,0,asm,10006+20000,182.4199
gfx938,no_quant,torch.bfloat16,64,192,2048,128,8,0,0,asm,10006+20000,213.6452
gfx938,no_quant,torch.bfloat16,128,192,2048,128,8,0,0,asm,10006+20000,205.5664
gfx938,no_quant,torch.bfloat16,256,192,2048,128,8,0,0,asm,11004+21001,225.3643
gfx938,no_quant,torch.bfloat16,512,192,2048,128,8,0,0,asm,11004+21001,268.9685
gfx938,no_quant,torch.bfloat16,1024,192,2048,128,8,0,0,asm,12002+22001,373.4482
gfx938,no_quant,torch.bfloat16,2048,192,2048,128,8,0,0,asm,12001+22001,544.0333
gfx938,no_quant,torch.bfloat16,4096,192,2048,128,8,0,0,asm,13001+23001,859.873
gfx938,no_quant,torch.bfloat16,8192,192,2048,128,8,0,0,asm,13001+23001,1515.4337
gfx938,no_quant,torch.bfloat16,16384,192,2048,128,8,0,0,asm,13001+23001,2881.8408
gfx938,no_quant,torch.bfloat16,32768,192,2048,128,8,0,0,asm,13001+23001,5550.2244
gfx938,no_quant,torch.bfloat16,65536,192,2048,128,8,0,0,asm,13001+23001,10944.1702
gfx938,no_quant,torch.bfloat16,1,384,2048,128,8,0,0,asm,10001+20000,53.056
gfx938,no_quant,torch.bfloat16,2,384,2048,128,8,0,0,asm,10006+20000,77.3086
gfx938,no_quant,torch.bfloat16,4,384,2048,128,8,0,0,asm,10006+20000,112.6348
gfx938,no_quant,torch.bfloat16,8,384,2048,128,8,0,0,asm,10006+20000,177.2747
gfx938,no_quant,torch.bfloat16,16,384,2048,128,8,0,0,asm,10006+20000,260.6267
gfx938,no_quant,torch.bfloat16,32,384,2048,128,8,0,0,asm,10006+20000,320.2976
gfx938,no_quant,torch.bfloat16,64,384,2048,128,8,0,0,asm,10006+20000,367.4922
gfx938,no_quant,torch.bfloat16,128,384,2048,128,8,0,0,asm,10009+20000,364.0352
gfx938,no_quant,torch.bfloat16,256,384,2048,128,8,0,0,asm,11004+21001,391.2504
gfx938,no_quant,torch.bfloat16,512,384,2048,128,8,0,0,asm,12000+22001,455.2254
gfx938,no_quant,torch.bfloat16,1024,384,2048,128,8,0,0,asm,12001+22001,542.8131
gfx938,no_quant,torch.bfloat16,2048,384,2048,128,8,0,0,asm,13001+23001,709.3484
gfx938,no_quant,torch.bfloat16,4096,384,2048,128,8,0,0,asm,13001+23001,1144.2526
gfx938,no_quant,torch.bfloat16,8192,384,2048,128,8,0,0,asm,13001+23001,1982.3018
gfx938,no_quant,torch.bfloat16,16384,384,2048,128,8,0,0,asm,13001+23001,3922.8848
gfx938,no_quant,torch.bfloat16,32768,384,2048,128,8,0,0,asm,13001+23001,7601.1435
gfx938,no_quant,torch.bfloat16,65536,384,2048,128,8,0,0,asm,13001+23001,15053.8397
gfx938,no_quant,torch.bfloat16,1,768,2048,128,8,0,0,asm,10006+20000,75.2789
gfx938,no_quant,torch.bfloat16,2,768,2048,128,8,0,0,asm,10006+20000,119.599
gfx938,no_quant,torch.bfloat16,4,768,2048,128,8,0,0,asm,10007+20000,189.241
gfx938,no_quant,torch.bfloat16,8,768,2048,128,8,0,0,asm,10006+20000,311.8679
gfx938,no_quant,torch.bfloat16,16,768,2048,128,8,0,0,asm,10008+20000,465.2827
gfx938,no_quant,torch.bfloat16,32,768,2048,128,8,0,0,asm,10008+20000,574.3358
gfx938,no_quant,torch.bfloat16,64,768,2048,128,8,0,0,asm,10008+20000,659.7834
gfx938,no_quant,torch.bfloat16,128,768,2048,128,8,0,0,asm,10008+20000,672.0162
gfx938,no_quant,torch.bfloat16,256,768,2048,128,8,0,0,asm,11002+21001,716.3866
gfx938,no_quant,torch.bfloat16,512,768,2048,128,8,0,0,asm,12005+22001,802.6013
gfx938,no_quant,torch.bfloat16,1024,768,2048,128,8,0,0,asm,13001+23001,945.1779
gfx938,no_quant,torch.bfloat16,2048,768,2048,128,8,0,0,asm,13001+23001,1243.9816
gfx938,no_quant,torch.bfloat16,4096,768,2048,128,8,0,0,asm,13001+23001,1989.4641
gfx938,no_quant,torch.bfloat16,8192,768,2048,128,8,0,0,asm,13001+23001,3554.2789
gfx938,no_quant,torch.bfloat16,16384,768,2048,128,8,0,0,asm,13001+23001,6779.7759
gfx938,no_quant,torch.bfloat16,32768,768,2048,128,8,0,0,asm,13001+23001,13203.9373
gfx938,no_quant,torch.bfloat16,65536,768,2048,128,8,0,0,asm,13001+23001,26121.7552
gfx936,no_quant,torch.float16,1,256,3072,256,8,0,0,asm,10006+20000,56.4327
gfx936,no_quant,torch.float16,2,256,3072,256,8,0,0,asm,10006+20000,85.2664
gfx936,no_quant,torch.float16,4,256,3072,256,8,0,0,asm,10004+20000,148.02
......@@ -221,3 +272,31 @@ gfx936,no_quant,torch.float16,12288,128,3072,256,8,0,0,asm,13001+23001,2844.9048
gfx936,no_quant,torch.float16,16384,128,3072,256,8,0,0,asm,13001+23001,3597.2571
gfx936,no_quant,torch.float16,24576,128,3072,256,8,0,0,asm,13001+23001,5205.65
gfx936,no_quant,torch.float16,32768,128,3072,256,8,0,0,asm,13001+23001,6847.9883
gfx936,no_quant,torch.bfloat16,1,384,2048,128,8,0,0,asm,10005+20000,57.5107
gfx936,no_quant,torch.bfloat16,2,384,2048,128,8,0,0,asm,10005+20000,86.1507
gfx936,no_quant,torch.bfloat16,4,384,2048,128,8,0,0,asm,10001+20000,137.9569
gfx936,no_quant,torch.bfloat16,8,384,2048,128,8,0,0,asm,10001+20000,230.5798
gfx936,no_quant,torch.bfloat16,16,384,2048,128,8,0,0,asm,10001+20000,352.5754
gfx936,no_quant,torch.bfloat16,32,384,2048,128,8,0,0,asm,10001+20000,436.6174
gfx936,no_quant,torch.bfloat16,48,384,2048,128,8,0,0,asm,10001+20001,490.5933
gfx936,no_quant,torch.bfloat16,64,384,2048,128,8,0,0,asm,10001+20001,508.85309
gfx936,no_quant,torch.bfloat16,96,384,2048,128,8,0,0,asm,10001+20001,510.02899
gfx936,no_quant,torch.bfloat16,128,384,2048,128,8,0,0,asm,10001+20001,517.6922
gfx936,no_quant,torch.bfloat16,200,384,2048,128,8,0,0,asm,11000+21001,564.4711
gfx936,no_quant,torch.bfloat16,256,384,2048,128,8,0,0,asm,11000+21001,580.3952
gfx936,no_quant,torch.bfloat16,384,384,2048,128,8,0,0,asm,11000+21001,635.8056
gfx936,no_quant,torch.bfloat16,460,384,2048,128,8,0,0,asm,11000+21001,672.3782
gfx936,no_quant,torch.bfloat16,512,384,2048,128,8,0,0,asm,11006+20002,695.6287
gfx936,no_quant,torch.bfloat16,798,384,2048,128,8,0,0,asm,12004+22001,731.5276
gfx936,no_quant,torch.bfloat16,1024,384,2048,128,8,0,0,asm,12000+22001,779.5612
gfx936,no_quant,torch.bfloat16,1280,384,2048,128,8,0,0,asm,12000+22001,832.0495
gfx936,no_quant,torch.bfloat16,1440,384,2048,128,8,0,0,asm,13000+22001,891.2665
gfx936,no_quant,torch.bfloat16,1560,384,2048,128,8,0,0,asm,12004+22001,882.4158
gfx936,no_quant,torch.bfloat16,1880,384,2048,128,8,0,0,asm,13000+22001,885.6415
gfx936,no_quant,torch.bfloat16,2000,384,2048,128,8,0,0,asm,13000+23001,919.6288
gfx936,no_quant,torch.bfloat16,2200,384,2048,128,8,0,0,asm,12005+22001,965.1782
gfx936,no_quant,torch.bfloat16,2400,384,2048,128,8,0,0,asm,12001+22001,999.2413
gfx936,no_quant,torch.bfloat16,2800,384,2048,128,8,0,0,asm,13001+23001,1065.1948
gfx936,no_quant,torch.bfloat16,3200,384,2048,128,8,0,0,asm,13001+23001,1126.6853
gfx936,no_quant,torch.bfloat16,3660,384,2048,128,8,0,0,asm,13001+23001,1216.6051
gfx936,no_quant,torch.bfloat16,4096,384,2048,128,8,0,0,asm,13001+23001,1259.6619
\ No newline at end of file
......@@ -1042,3 +1042,1511 @@ gfx936,int8_w8a8_channel,torch.float16,12288,128,3072,256,8,0,0,asm,13001+23001,
gfx936,int8_w8a8_channel,torch.float16,16384,128,3072,256,8,0,0,asm,13001+23001,3264.3642
gfx936,int8_w8a8_channel,torch.float16,24576,128,3072,256,8,0,0,asm,13001+23001,4759.7737
gfx936,int8_w8a8_channel,torch.float16,32768,128,3072,256,8,0,0,asm,13001+23001,6279.1322
gfx938,f8_w8a8_channel,torch.bfloat16,1,512,4096,256,6,0,0,asm,10008+20000,64.9697
gfx938,f8_w8a8_channel,torch.bfloat16,2,512,4096,256,6,0,0,asm,10011+20000,93.5172
gfx938,f8_w8a8_channel,torch.bfloat16,3,512,4096,256,6,0,0,asm,10002+20000,132.2538
gfx938,f8_w8a8_channel,torch.bfloat16,4,512,4096,256,6,0,0,asm,10011+20000,155.6137
gfx938,f8_w8a8_channel,torch.bfloat16,5,512,4096,256,6,0,0,asm,10011+20000,179.6641
gfx938,f8_w8a8_channel,torch.bfloat16,6,512,4096,256,6,0,0,asm,10002+20000,220.5063
gfx938,f8_w8a8_channel,torch.bfloat16,7,512,4096,256,6,0,0,asm,10011+20000,239.3947
gfx938,f8_w8a8_channel,torch.bfloat16,8,512,4096,256,6,0,0,asm,10013+20000,259.5464
gfx938,f8_w8a8_channel,torch.bfloat16,9,512,4096,256,6,0,0,asm,10002+20000,299.5799
gfx938,f8_w8a8_channel,torch.bfloat16,10,512,4096,256,6,0,0,asm,10002+20000,320.0597
gfx938,f8_w8a8_channel,torch.bfloat16,11,512,4096,256,6,0,0,asm,10011+20000,333.7271
gfx938,f8_w8a8_channel,torch.bfloat16,12,512,4096,256,6,0,0,asm,10011+20000,342.6955
gfx938,f8_w8a8_channel,torch.bfloat16,13,512,4096,256,6,0,0,asm,10013+20000,359.1504
gfx938,f8_w8a8_channel,torch.bfloat16,14,512,4096,256,6,0,0,asm,10002+20000,395.1839
gfx938,f8_w8a8_channel,torch.bfloat16,15,512,4096,256,6,0,0,asm,10011+20000,410.9396
gfx938,f8_w8a8_channel,torch.bfloat16,16,512,4096,256,6,0,0,asm,10011+20000,420.0933
gfx938,f8_w8a8_channel,torch.bfloat16,17,512,4096,256,6,0,0,asm,10013+20000,427.3493
gfx938,f8_w8a8_channel,torch.bfloat16,18,512,4096,256,6,0,0,asm,10002+20000,453.3787
gfx938,f8_w8a8_channel,torch.bfloat16,20,512,4096,256,6,0,0,asm,11005+20000,490.2628
gfx938,f8_w8a8_channel,torch.bfloat16,24,512,4096,256,6,0,0,asm,11007+20000,567.5428
gfx938,f8_w8a8_channel,torch.bfloat16,28,512,4096,256,6,0,0,asm,10002+20000,655.0066
gfx938,f8_w8a8_channel,torch.bfloat16,32,512,4096,256,6,0,0,asm,10002+20000,726.0803
gfx938,f8_w8a8_channel,torch.bfloat16,34,512,4096,256,6,0,0,asm,10013+20000,734.8098
gfx938,f8_w8a8_channel,torch.bfloat16,36,512,4096,256,6,0,0,asm,10009+20000,775.795
gfx938,f8_w8a8_channel,torch.bfloat16,40,512,4096,256,6,0,0,asm,11007+20000,818.0771
gfx938,f8_w8a8_channel,torch.bfloat16,44,512,4096,256,6,0,0,asm,11007+20000,887.8286
gfx938,f8_w8a8_channel,torch.bfloat16,48,512,4096,256,6,0,0,asm,11004+20000,932.4768
gfx938,f8_w8a8_channel,torch.bfloat16,56,512,4096,256,6,0,0,asm,10013+20000,986.0515
gfx938,f8_w8a8_channel,torch.bfloat16,64,512,4096,256,6,0,0,asm,10013+20000,1050.5229
gfx938,f8_w8a8_channel,torch.bfloat16,68,512,4096,256,6,0,0,asm,10013+20000,1069.2597
gfx938,f8_w8a8_channel,torch.bfloat16,72,512,4096,256,6,0,0,asm,10013+20000,1122.986
gfx938,f8_w8a8_channel,torch.bfloat16,80,512,4096,256,6,0,0,asm,10013+20000,1141.0322
gfx938,f8_w8a8_channel,torch.bfloat16,88,512,4096,256,6,0,0,asm,10013+20000,1176.19
gfx938,f8_w8a8_channel,torch.bfloat16,96,512,4096,256,6,0,0,asm,10013+20000,1191.3393
gfx938,f8_w8a8_channel,torch.bfloat16,104,512,4096,256,6,0,0,asm,10013+20000,1208.3246
gfx938,f8_w8a8_channel,torch.bfloat16,112,512,4096,256,6,0,0,asm,10013+20000,1223.2634
gfx938,f8_w8a8_channel,torch.bfloat16,128,512,4096,256,6,0,0,asm,10013+20000,1256.1306
gfx938,f8_w8a8_channel,torch.bfloat16,144,512,4096,256,6,0,0,asm,10013+20000,1273.7726
gfx938,f8_w8a8_channel,torch.bfloat16,160,512,4096,256,6,0,0,asm,10013+20000,1337.7641
gfx938,f8_w8a8_channel,torch.bfloat16,192,512,4096,256,6,0,0,asm,10013+20000,1323.9031
gfx938,f8_w8a8_channel,torch.bfloat16,224,512,4096,256,6,0,0,asm,10013+20000,1333.9326
gfx938,f8_w8a8_channel,torch.bfloat16,256,512,4096,256,6,0,0,asm,10013+20000,1343.3811
gfx938,f8_w8a8_channel,torch.bfloat16,320,512,4096,256,6,0,0,asm,10013+20000,1357.4695
gfx938,f8_w8a8_channel,torch.bfloat16,384,512,4096,256,6,0,0,asm,10013+20000,1423.6841
gfx938,f8_w8a8_channel,torch.bfloat16,448,512,4096,256,6,0,0,asm,12001+22000,1404.2484
gfx938,f8_w8a8_channel,torch.bfloat16,512,512,4096,256,6,0,0,asm,12005+22000,1424.821
gfx938,f8_w8a8_channel,torch.bfloat16,576,512,4096,256,6,0,0,asm,12005+22000,1483.4819
gfx938,f8_w8a8_channel,torch.bfloat16,640,512,4096,256,6,0,0,asm,12005+22000,1449.8736
gfx938,f8_w8a8_channel,torch.bfloat16,704,512,4096,256,6,0,0,asm,12005+22000,1460.4673
gfx938,f8_w8a8_channel,torch.bfloat16,768,512,4096,256,6,0,0,asm,11007+21000,1516.282
gfx938,f8_w8a8_channel,torch.bfloat16,832,512,4096,256,6,0,0,asm,12001+22000,1484.9305
gfx938,f8_w8a8_channel,torch.bfloat16,896,512,4096,256,6,0,0,asm,12005+22000,1498.9684
gfx938,f8_w8a8_channel,torch.bfloat16,960,512,4096,256,6,0,0,asm,12001+22000,1515.9789
gfx938,f8_w8a8_channel,torch.bfloat16,1024,512,4096,256,6,0,0,asm,12001+22000,1530.6652
gfx938,f8_w8a8_channel,torch.bfloat16,1152,512,4096,256,6,0,0,asm,12001+22000,1603.4229
gfx938,f8_w8a8_channel,torch.bfloat16,1280,512,4096,256,6,0,0,asm,12001+22000,1593.6629
gfx938,f8_w8a8_channel,torch.bfloat16,1408,512,4096,256,6,0,0,asm,12001+22000,1618.6649
gfx938,f8_w8a8_channel,torch.bfloat16,1536,512,4096,256,6,0,0,asm,12001+22000,1649.2079
gfx938,f8_w8a8_channel,torch.bfloat16,1664,512,4096,256,6,0,0,asm,12001+22000,1689.0732
gfx938,f8_w8a8_channel,torch.bfloat16,1792,512,4096,256,6,0,0,asm,12001+22000,1696.9302
gfx938,f8_w8a8_channel,torch.bfloat16,1920,512,4096,256,6,0,0,asm,12001+22000,1729.9067
gfx938,f8_w8a8_channel,torch.bfloat16,2048,512,4096,256,6,0,0,asm,12001+22000,1758.7655
gfx938,f8_w8a8_channel,torch.bfloat16,2304,512,4096,256,6,0,0,asm,12005+22000,1826.9085
gfx938,f8_w8a8_channel,torch.bfloat16,2560,512,4096,256,6,0,0,asm,12001+22000,1889.4516
gfx938,f8_w8a8_channel,torch.bfloat16,2816,512,4096,256,6,0,0,asm,13001+23001,2053.8806
gfx938,f8_w8a8_channel,torch.bfloat16,3072,512,4096,256,6,0,0,asm,13001+23001,2078.0912
gfx938,f8_w8a8_channel,torch.bfloat16,3328,512,4096,256,6,0,0,asm,13001+23001,2136.8785
gfx938,f8_w8a8_channel,torch.bfloat16,3584,512,4096,256,6,0,0,asm,13001+23001,2151.1017
gfx938,f8_w8a8_channel,torch.bfloat16,3840,512,4096,256,6,0,0,asm,13001+23001,2190.1756
gfx938,f8_w8a8_channel,torch.bfloat16,4096,512,4096,256,6,0,0,asm,13001+23001,2215.1607
gfx938,f8_w8a8_channel,torch.bfloat16,4608,512,4096,256,6,0,0,asm,13001+23001,2345.1226
gfx938,f8_w8a8_channel,torch.bfloat16,5120,512,4096,256,6,0,0,asm,13001+23001,2609.8883
gfx938,f8_w8a8_channel,torch.bfloat16,5632,512,4096,256,6,0,0,asm,13001+23001,3160.4807
gfx938,f8_w8a8_channel,torch.bfloat16,6144,512,4096,256,6,0,0,asm,12001+22001,3582.846
gfx938,f8_w8a8_channel,torch.bfloat16,6656,512,4096,256,6,0,0,asm,12001+22001,3712.0919
gfx938,f8_w8a8_channel,torch.bfloat16,7168,512,4096,256,6,0,0,asm,12001+22001,3825.0181
gfx938,f8_w8a8_channel,torch.bfloat16,7680,512,4096,256,6,0,0,asm,12001+22001,4007.3421
gfx938,f8_w8a8_channel,torch.bfloat16,8192,512,4096,256,6,0,0,asm,13001+23001,4081.6409
gfx938,f8_w8a8_channel,torch.bfloat16,10240,512,4096,256,6,0,0,asm,13001+23001,4520.1242
gfx938,f8_w8a8_channel,torch.bfloat16,12288,512,4096,256,6,0,0,asm,13001+23001,5796.0966
gfx938,f8_w8a8_channel,torch.bfloat16,14336,512,4096,256,6,0,0,asm,13001+23001,6163.6752
gfx938,f8_w8a8_channel,torch.bfloat16,16384,512,4096,256,6,0,0,asm,13001+23001,6976.4906
gfx938,f8_w8a8_channel,torch.bfloat16,17408,512,4096,256,6,0,0,asm,13001+23001,7636.5647
gfx938,f8_w8a8_channel,torch.bfloat16,24576,512,4096,256,6,0,0,asm,13001+23001,10009.1398
gfx938,f8_w8a8_channel,torch.bfloat16,32768,512,4096,256,6,0,0,asm,13001+23001,13063.6081
gfx938,f8_w8a8_channel,torch.bfloat16,40960,512,4096,256,6,0,0,asm,13001+23001,16152.603299999999
gfx938,f8_w8a8_channel,torch.bfloat16,49152,512,4096,256,6,0,0,asm,13001+23001,19138.8131
gfx938,f8_w8a8_channel,torch.bfloat16,57344,512,4096,256,6,0,0,asm,13001+23001,22122.0234
gfx938,f8_w8a8_channel,torch.bfloat16,65536,512,4096,256,6,0,0,asm,13001+23001,25145.5661
gfx938,f8_w8a8_channel,torch.bfloat16,1,256,4096,256,6,0,0,asm,10002+20000,49.0539
gfx938,f8_w8a8_channel,torch.bfloat16,2,256,4096,256,6,0,0,asm,10008+20000,69.6096
gfx938,f8_w8a8_channel,torch.bfloat16,3,256,4096,256,6,0,0,asm,10011+20000,81.9043
gfx938,f8_w8a8_channel,torch.bfloat16,4,256,4096,256,6,0,0,asm,10011+20000,96.3972
gfx938,f8_w8a8_channel,torch.bfloat16,5,256,4096,256,6,0,0,asm,10013+20000,108.3464
gfx938,f8_w8a8_channel,torch.bfloat16,6,256,4096,256,6,0,0,asm,10002+20000,135.0663
gfx938,f8_w8a8_channel,torch.bfloat16,7,256,4096,256,6,0,0,asm,10008+20000,146.2664
gfx938,f8_w8a8_channel,torch.bfloat16,8,256,4096,256,6,0,0,asm,10009+20000,154.9147
gfx938,f8_w8a8_channel,torch.bfloat16,9,256,4096,256,6,0,0,asm,10011+20000,164.0599
gfx938,f8_w8a8_channel,torch.bfloat16,10,256,4096,256,6,0,0,asm,10011+20000,173.3991
gfx938,f8_w8a8_channel,torch.bfloat16,11,256,4096,256,6,0,0,asm,10012+20000,182.8562
gfx938,f8_w8a8_channel,torch.bfloat16,12,256,4096,256,6,0,0,asm,10011+20000,187.5294
gfx938,f8_w8a8_channel,torch.bfloat16,13,256,4096,256,6,0,0,asm,10011+20000,196.4389
gfx938,f8_w8a8_channel,torch.bfloat16,14,256,4096,256,6,0,0,asm,10002+20000,224.5231
gfx938,f8_w8a8_channel,torch.bfloat16,15,256,4096,256,6,0,0,asm,10009+20000,234.4093
gfx938,f8_w8a8_channel,torch.bfloat16,16,256,4096,256,6,0,0,asm,10008+20000,236.0515
gfx938,f8_w8a8_channel,torch.bfloat16,17,256,4096,256,6,0,0,asm,10011+20000,232.4528
gfx938,f8_w8a8_channel,torch.bfloat16,18,256,4096,256,6,0,0,asm,10011+20000,238.6592
gfx938,f8_w8a8_channel,torch.bfloat16,20,256,4096,256,6,0,0,asm,10011+20000,260.2927
gfx938,f8_w8a8_channel,torch.bfloat16,24,256,4096,256,6,0,0,asm,10011+20000,306.3895
gfx938,f8_w8a8_channel,torch.bfloat16,28,256,4096,256,6,0,0,asm,10011+20000,343.4197
gfx938,f8_w8a8_channel,torch.bfloat16,32,256,4096,256,6,0,0,asm,10002+20000,396.7588
gfx938,f8_w8a8_channel,torch.bfloat16,34,256,4096,256,6,0,0,asm,10011+20000,389.6483
gfx938,f8_w8a8_channel,torch.bfloat16,36,256,4096,256,6,0,0,asm,10011+20000,393.9598
gfx938,f8_w8a8_channel,torch.bfloat16,40,256,4096,256,6,0,0,asm,10013+20000,413.7998
gfx938,f8_w8a8_channel,torch.bfloat16,44,256,4096,256,6,0,0,asm,10011+20000,469.0166
gfx938,f8_w8a8_channel,torch.bfloat16,48,256,4096,256,6,0,0,asm,10011+20000,478.7007
gfx938,f8_w8a8_channel,torch.bfloat16,56,256,4096,256,6,0,0,asm,10013+20000,496.7975
gfx938,f8_w8a8_channel,torch.bfloat16,64,256,4096,256,6,0,0,asm,10013+20000,543.4332
gfx938,f8_w8a8_channel,torch.bfloat16,68,256,4096,256,6,0,0,asm,10013+20000,544.5616
gfx938,f8_w8a8_channel,torch.bfloat16,72,256,4096,256,6,0,0,asm,11007+21000,570.2289
gfx938,f8_w8a8_channel,torch.bfloat16,80,256,4096,256,6,0,0,asm,10013+20000,567.2564
gfx938,f8_w8a8_channel,torch.bfloat16,88,256,4096,256,6,0,0,asm,10013+20000,596.8983
gfx938,f8_w8a8_channel,torch.bfloat16,96,256,4096,256,6,0,0,asm,10013+20000,617.5636
gfx938,f8_w8a8_channel,torch.bfloat16,104,256,4096,256,6,0,0,asm,10013+20000,621.3699
gfx938,f8_w8a8_channel,torch.bfloat16,112,256,4096,256,6,0,0,asm,10013+20000,625.2604
gfx938,f8_w8a8_channel,torch.bfloat16,128,256,4096,256,6,0,0,asm,10013+20000,638.4225
gfx938,f8_w8a8_channel,torch.bfloat16,144,256,4096,256,6,0,0,asm,10011+20000,645.5047
gfx938,f8_w8a8_channel,torch.bfloat16,160,256,4096,256,6,0,0,asm,10011+20000,705.6814
gfx938,f8_w8a8_channel,torch.bfloat16,192,256,4096,256,6,0,0,asm,10013+20000,674.2035
gfx938,f8_w8a8_channel,torch.bfloat16,224,256,4096,256,6,0,0,asm,11007+21000,683.3825
gfx938,f8_w8a8_channel,torch.bfloat16,256,256,4096,256,6,0,0,asm,11007+21000,684.8983
gfx938,f8_w8a8_channel,torch.bfloat16,320,256,4096,256,6,0,0,asm,11007+21000,699.3994
gfx938,f8_w8a8_channel,torch.bfloat16,384,256,4096,256,6,0,0,asm,11007+21000,713.1762
gfx938,f8_w8a8_channel,torch.bfloat16,448,256,4096,256,6,0,0,asm,11007+21000,723.7025
gfx938,f8_w8a8_channel,torch.bfloat16,512,256,4096,256,6,0,0,asm,11007+21000,738.852
gfx938,f8_w8a8_channel,torch.bfloat16,576,256,4096,256,6,0,0,asm,11007+21000,799.3572
gfx938,f8_w8a8_channel,torch.bfloat16,640,256,4096,256,6,0,0,asm,11007+21000,762.5656
gfx938,f8_w8a8_channel,torch.bfloat16,704,256,4096,256,6,0,0,asm,11007+21000,778.4477
gfx938,f8_w8a8_channel,torch.bfloat16,768,256,4096,256,6,0,0,asm,11007+21000,839.9381
gfx938,f8_w8a8_channel,torch.bfloat16,832,256,4096,256,6,0,0,asm,11007+21000,799.0623
gfx938,f8_w8a8_channel,torch.bfloat16,896,256,4096,256,6,0,0,asm,11007+21000,836.7212
gfx938,f8_w8a8_channel,torch.bfloat16,960,256,4096,256,6,0,0,asm,11007+21000,848.0727
gfx938,f8_w8a8_channel,torch.bfloat16,1024,256,4096,256,6,0,0,asm,11007+21000,856.5527
gfx938,f8_w8a8_channel,torch.bfloat16,1152,256,4096,256,6,0,0,asm,11005+21000,924.4178
gfx938,f8_w8a8_channel,torch.bfloat16,1280,256,4096,256,6,0,0,asm,12005+22001,921.0157
gfx938,f8_w8a8_channel,torch.bfloat16,1408,256,4096,256,6,0,0,asm,12005+22001,944.8219
gfx938,f8_w8a8_channel,torch.bfloat16,1536,256,4096,256,6,0,0,asm,12005+22001,1006.0514000000001
gfx938,f8_w8a8_channel,torch.bfloat16,1664,256,4096,256,6,0,0,asm,12005+22001,996.0051
gfx938,f8_w8a8_channel,torch.bfloat16,1792,256,4096,256,6,0,0,asm,12001+22001,1008.4261000000001
gfx938,f8_w8a8_channel,torch.bfloat16,1920,256,4096,256,6,0,0,asm,12005+22001,1039.9376
gfx938,f8_w8a8_channel,torch.bfloat16,2048,256,4096,256,6,0,0,asm,12005+22001,1055.2724
gfx938,f8_w8a8_channel,torch.bfloat16,2304,256,4096,256,6,0,0,asm,12005+22001,1132.6533
gfx938,f8_w8a8_channel,torch.bfloat16,2560,256,4096,256,6,0,0,asm,12005+22001,1180.8385
gfx938,f8_w8a8_channel,torch.bfloat16,2816,256,4096,256,6,0,0,asm,12001+22001,1316.072
gfx938,f8_w8a8_channel,torch.bfloat16,3072,256,4096,256,6,0,0,asm,13001+23001,1364.4256
gfx938,f8_w8a8_channel,torch.bfloat16,3328,256,4096,256,6,0,0,asm,13001+23001,1402.0171
gfx938,f8_w8a8_channel,torch.bfloat16,3584,256,4096,256,6,0,0,asm,13001+23001,1428.0044
gfx938,f8_w8a8_channel,torch.bfloat16,3840,256,4096,256,6,0,0,asm,13001+23001,1458.6064
gfx938,f8_w8a8_channel,torch.bfloat16,4096,256,4096,256,6,0,0,asm,13001+23001,1480.5179
gfx938,f8_w8a8_channel,torch.bfloat16,4608,256,4096,256,6,0,0,asm,13001+23001,1593.1746
gfx938,f8_w8a8_channel,torch.bfloat16,5120,256,4096,256,6,0,0,asm,13001+23001,1768.6016
gfx938,f8_w8a8_channel,torch.bfloat16,5632,256,4096,256,6,0,0,asm,13001+23001,2115.3041
gfx938,f8_w8a8_channel,torch.bfloat16,6144,256,4096,256,6,0,0,asm,12001+22001,2367.6908
gfx938,f8_w8a8_channel,torch.bfloat16,6656,256,4096,256,6,0,0,asm,12001+22001,2451.6906
gfx938,f8_w8a8_channel,torch.bfloat16,7168,256,4096,256,6,0,0,asm,12001+22001,2551.1766
gfx938,f8_w8a8_channel,torch.bfloat16,7680,256,4096,256,6,0,0,asm,12001+22001,2670.5869
gfx938,f8_w8a8_channel,torch.bfloat16,8192,256,4096,256,6,0,0,asm,13001+23001,2771.5974
gfx938,f8_w8a8_channel,torch.bfloat16,10240,256,4096,256,6,0,0,asm,13001+23001,3116.8093
gfx938,f8_w8a8_channel,torch.bfloat16,12288,256,4096,256,6,0,0,asm,13001+23001,4024.8074999999994
gfx938,f8_w8a8_channel,torch.bfloat16,14336,256,4096,256,6,0,0,asm,13001+23001,4314.9881
gfx938,f8_w8a8_channel,torch.bfloat16,16384,256,4096,256,6,0,0,asm,13001+23001,4910.3977
gfx938,f8_w8a8_channel,torch.bfloat16,17408,256,4096,256,6,0,0,asm,13001+23001,5376.1151
gfx938,f8_w8a8_channel,torch.bfloat16,24576,256,4096,256,6,0,0,asm,13001+23001,7071.3878
gfx938,f8_w8a8_channel,torch.bfloat16,32768,256,4096,256,6,0,0,asm,13001+23001,9263.7619
gfx938,f8_w8a8_channel,torch.bfloat16,40960,256,4096,256,6,0,0,asm,13001+23001,11379.3186
gfx938,f8_w8a8_channel,torch.bfloat16,49152,256,4096,256,6,0,0,asm,13001+23001,13529.0724
gfx938,f8_w8a8_channel,torch.bfloat16,57344,256,4096,256,6,0,0,asm,13001+23001,15696.9227
gfx938,f8_w8a8_channel,torch.bfloat16,65536,256,4096,256,6,0,0,asm,13001+23001,17822.9558
gfx938,f8_w8a8_channel,torch.bfloat16,65536,256,4096,256,6,0,0,asm,13001+23001,17822.9558
gfx938,f8_w8a8_channel,torch.bfloat16,1,128,4096,512,10,0,0,asm,10002+20100,61.3404
gfx938,f8_w8a8_channel,torch.bfloat16,2,128,4096,512,10,0,0,asm,10008+20101,79.4288
gfx938,f8_w8a8_channel,torch.bfloat16,3,128,4096,512,10,0,0,asm,10008+20101,89.4075
gfx938,f8_w8a8_channel,torch.bfloat16,4,128,4096,512,10,0,0,asm,10011+20001,101.8791
gfx938,f8_w8a8_channel,torch.bfloat16,5,128,4096,512,10,0,0,asm,10011+20001,110.3675
gfx938,f8_w8a8_channel,torch.bfloat16,6,128,4096,512,10,0,0,asm,10011+20101,117.7528
gfx938,f8_w8a8_channel,torch.bfloat16,7,128,4096,512,10,0,0,asm,10013+20101,127.9168
gfx938,f8_w8a8_channel,torch.bfloat16,8,128,4096,512,10,0,0,asm,10002+20001,156.7674
gfx938,f8_w8a8_channel,torch.bfloat16,9,128,4096,512,10,0,0,asm,10002+20101,169.3063
gfx938,f8_w8a8_channel,torch.bfloat16,10,128,4096,512,10,0,0,asm,10008+20101,179.1253
gfx938,f8_w8a8_channel,torch.bfloat16,11,128,4096,512,10,0,0,asm,10011+20101,186.9651
gfx938,f8_w8a8_channel,torch.bfloat16,12,128,4096,512,10,0,0,asm,10011+20001,192.6495
gfx938,f8_w8a8_channel,torch.bfloat16,13,128,4096,512,10,0,0,asm,10011+20101,204.7336
gfx938,f8_w8a8_channel,torch.bfloat16,14,128,4096,512,10,0,0,asm,10011+20101,214.9736
gfx938,f8_w8a8_channel,torch.bfloat16,15,128,4096,512,10,0,0,asm,10002+20001,247.1083
gfx938,f8_w8a8_channel,torch.bfloat16,16,128,4096,512,10,0,0,asm,10002+20001,257.2555
gfx938,f8_w8a8_channel,torch.bfloat16,17,128,4096,512,10,0,0,asm,10011+20101,242.44
gfx938,f8_w8a8_channel,torch.bfloat16,18,128,4096,512,10,0,0,asm,10011+20101,247.6779
gfx938,f8_w8a8_channel,torch.bfloat16,20,128,4096,512,10,0,0,asm,10011+20001,256.4443
gfx938,f8_w8a8_channel,torch.bfloat16,24,128,4096,512,10,0,0,asm,10002+20101,309.7493
gfx938,f8_w8a8_channel,torch.bfloat16,28,128,4096,512,10,0,0,asm,10011+20101,327.4082
gfx938,f8_w8a8_channel,torch.bfloat16,32,128,4096,512,10,0,0,asm,10011+20101,389.7154
gfx938,f8_w8a8_channel,torch.bfloat16,34,128,4096,512,10,0,0,asm,10012+20101,357.9008
gfx938,f8_w8a8_channel,torch.bfloat16,36,128,4096,512,10,0,0,asm,10002+20101,393.1261
gfx938,f8_w8a8_channel,torch.bfloat16,40,128,4096,512,10,0,0,asm,10011+20101,398.0945
gfx938,f8_w8a8_channel,torch.bfloat16,44,128,4096,512,10,0,0,asm,10011+20101,415.5933
gfx938,f8_w8a8_channel,torch.bfloat16,48,128,4096,512,10,0,0,asm,10013+20101,429.2944
gfx938,f8_w8a8_channel,torch.bfloat16,56,128,4096,512,10,0,0,asm,10011+20101,477.2353
gfx938,f8_w8a8_channel,torch.bfloat16,64,128,4096,512,10,0,0,asm,10012+20101,511.1384
gfx938,f8_w8a8_channel,torch.bfloat16,68,128,4096,512,10,0,0,asm,10013+20101,503.0374
gfx938,f8_w8a8_channel,torch.bfloat16,72,128,4096,512,10,0,0,asm,10011+20001,560.5698
gfx938,f8_w8a8_channel,torch.bfloat16,80,128,4096,512,10,0,0,asm,10013+20101,558.2371
gfx938,f8_w8a8_channel,torch.bfloat16,88,128,4096,512,10,0,0,asm,10011+20101,559.5003
gfx938,f8_w8a8_channel,torch.bfloat16,96,128,4096,512,10,0,0,asm,10013+20101,580.7802
gfx938,f8_w8a8_channel,torch.bfloat16,104,128,4096,512,10,0,0,asm,10013+20101,618.5823
gfx938,f8_w8a8_channel,torch.bfloat16,112,128,4096,512,10,0,0,asm,10011+20101,633.9506
gfx938,f8_w8a8_channel,torch.bfloat16,128,128,4096,512,10,0,0,asm,10013+20101,640.2074
gfx938,f8_w8a8_channel,torch.bfloat16,144,128,4096,512,10,0,0,asm,10013+20101,651.0538
gfx938,f8_w8a8_channel,torch.bfloat16,160,128,4096,512,10,0,0,asm,10013+20101,662.6833
gfx938,f8_w8a8_channel,torch.bfloat16,192,128,4096,512,10,0,0,asm,10011+20101,671.2138
gfx938,f8_w8a8_channel,torch.bfloat16,224,128,4096,512,10,0,0,asm,10011+20101,679.5591
gfx938,f8_w8a8_channel,torch.bfloat16,256,128,4096,512,10,0,0,asm,10013+20101,691.0874
gfx938,f8_w8a8_channel,torch.bfloat16,320,128,4096,512,10,0,0,asm,10013+20101,710.0768
gfx938,f8_w8a8_channel,torch.bfloat16,384,128,4096,512,10,0,0,asm,10011+20101,720.1315
gfx938,f8_w8a8_channel,torch.bfloat16,448,128,4096,512,10,0,0,asm,11007+21101,739.2556
gfx938,f8_w8a8_channel,torch.bfloat16,512,128,4096,512,10,0,0,asm,11007+21101,755.2641
gfx938,f8_w8a8_channel,torch.bfloat16,576,128,4096,512,10,0,0,asm,11007+21101,815.3902
gfx938,f8_w8a8_channel,torch.bfloat16,640,128,4096,512,10,0,0,asm,11007+21101,775.8956
gfx938,f8_w8a8_channel,torch.bfloat16,704,128,4096,512,10,0,0,asm,11007+21101,788.7293
gfx938,f8_w8a8_channel,torch.bfloat16,768,128,4096,512,10,0,0,asm,11007+21101,797.6977
gfx938,f8_w8a8_channel,torch.bfloat16,832,128,4096,512,10,0,0,asm,11007+21101,804.5356
gfx938,f8_w8a8_channel,torch.bfloat16,896,128,4096,512,10,0,0,asm,11007+21101,813.3018
gfx938,f8_w8a8_channel,torch.bfloat16,960,128,4096,512,10,0,0,asm,11007+21101,827.7018
gfx938,f8_w8a8_channel,torch.bfloat16,1024,128,4096,512,10,0,0,asm,11007+21101,859.8112
gfx938,f8_w8a8_channel,torch.bfloat16,1152,128,4096,512,10,0,0,asm,11005+21101,906.9689
gfx938,f8_w8a8_channel,torch.bfloat16,1280,128,4096,512,10,0,0,asm,11007+21101,891.1711
gfx938,f8_w8a8_channel,torch.bfloat16,1408,128,4096,512,10,0,0,asm,11007+21101,941.4109
gfx938,f8_w8a8_channel,torch.bfloat16,1536,128,4096,512,10,0,0,asm,11005+21101,992.2741
gfx938,f8_w8a8_channel,torch.bfloat16,1664,128,4096,512,10,0,0,asm,11003+21101,1042.8424
gfx938,f8_w8a8_channel,torch.bfloat16,1792,128,4096,512,10,0,0,asm,11005+21101,1146.0506
gfx938,f8_w8a8_channel,torch.bfloat16,1920,128,4096,512,10,0,0,asm,12001+21101,1143.2548
gfx938,f8_w8a8_channel,torch.bfloat16,2048,128,4096,512,10,0,0,asm,12005+21101,1165.4021
gfx938,f8_w8a8_channel,torch.bfloat16,2304,128,4096,512,10,0,0,asm,12005+21101,1217.7977
gfx938,f8_w8a8_channel,torch.bfloat16,2560,128,4096,512,10,0,0,asm,12001+21101,1269.5702
gfx938,f8_w8a8_channel,torch.bfloat16,2816,128,4096,512,10,0,0,asm,12005+21101,1345.1573
gfx938,f8_w8a8_channel,torch.bfloat16,3072,128,4096,512,10,0,0,asm,12005+21101,1456.5676
gfx938,f8_w8a8_channel,torch.bfloat16,3328,128,4096,512,10,0,0,asm,12001+22101,1596.6935
gfx938,f8_w8a8_channel,torch.bfloat16,3584,128,4096,512,10,0,0,asm,12001+22101,1700.3058
gfx938,f8_w8a8_channel,torch.bfloat16,3840,128,4096,512,10,0,0,asm,13001+22101,1797.0805
gfx938,f8_w8a8_channel,torch.bfloat16,4096,128,4096,512,10,0,0,asm,13001+22101,1808.7774
gfx938,f8_w8a8_channel,torch.bfloat16,4608,128,4096,512,10,0,0,asm,13001+22101,1898.4193
gfx938,f8_w8a8_channel,torch.bfloat16,5120,128,4096,512,10,0,0,asm,13001+22101,2009.0547000000001
gfx938,f8_w8a8_channel,torch.bfloat16,5632,128,4096,512,10,0,0,asm,13001+22101,2101.2144
gfx938,f8_w8a8_channel,torch.bfloat16,6144,128,4096,512,10,0,0,asm,13001+22101,2300.4644
gfx938,f8_w8a8_channel,torch.bfloat16,6656,128,4096,512,10,0,0,asm,12001+22101,2613.3645
gfx938,f8_w8a8_channel,torch.bfloat16,7168,128,4096,512,10,0,0,asm,12001+22101,2836.1681
gfx938,f8_w8a8_channel,torch.bfloat16,7680,128,4096,512,10,0,0,asm,12001+22101,3003.8056
gfx938,f8_w8a8_channel,torch.bfloat16,8192,128,4096,512,10,0,0,asm,12001+22101,3106.1971
gfx938,f8_w8a8_channel,torch.bfloat16,9008,128,4096,512,10,0,0,asm,12001+22101,3256.4307
gfx938,f8_w8a8_channel,torch.bfloat16,10240,128,4096,512,10,0,0,asm,13001+22101,3614.1959
gfx938,f8_w8a8_channel,torch.bfloat16,12288,128,4096,512,10,0,0,asm,13001+22101,4049.7062999999994
gfx938,f8_w8a8_channel,torch.bfloat16,14336,128,4096,512,10,0,0,asm,12001+22101,5020.8025
gfx938,f8_w8a8_channel,torch.bfloat16,16384,128,4096,512,10,0,0,asm,13001+22101,5391.9083
gfx938,f8_w8a8_channel,torch.bfloat16,17408,128,4096,512,10,0,0,asm,13001+22101,5574.7625
gfx938,f8_w8a8_channel,torch.bfloat16,24576,128,4096,512,10,0,0,asm,13001+23101,7529.52
gfx938,f8_w8a8_channel,torch.bfloat16,32768,128,4096,512,10,0,0,asm,13001+23101,9864.0308
gfx938,f8_w8a8_channel,torch.bfloat16,40960,128,4096,512,10,0,0,asm,13001+23101,12122.0694
gfx938,f8_w8a8_channel,torch.bfloat16,49152,128,4096,512,10,0,0,asm,13001+23101,14532.4632
gfx938,f8_w8a8_channel,torch.bfloat16,57344,128,4096,512,10,0,0,asm,13001+23101,15834.6738
gfx938,f8_w8a8_channel,torch.bfloat16,65536,128,4096,512,10,0,0,asm,13001+23101,18168.4834
gfx938,f8_w8a8_channel,torch.bfloat16,1,192,4096,192,8,0,0,asm,10002+20001,48.0604
gfx938,f8_w8a8_channel,torch.bfloat16,2,192,4096,192,8,0,0,asm,10008+20001,61.4835
gfx938,f8_w8a8_channel,torch.bfloat16,3,192,4096,192,8,0,0,asm,10008+20000,76.4645
gfx938,f8_w8a8_channel,torch.bfloat16,4,192,4096,192,8,0,0,asm,10011+20000,87.3277
gfx938,f8_w8a8_channel,torch.bfloat16,5,192,4096,192,8,0,0,asm,10008+20000,102.3091
gfx938,f8_w8a8_channel,torch.bfloat16,6,192,4096,192,8,0,0,asm,10008+20001,109.9972
gfx938,f8_w8a8_channel,torch.bfloat16,7,192,4096,192,8,0,0,asm,10002+20001,127.1341
gfx938,f8_w8a8_channel,torch.bfloat16,8,192,4096,192,8,0,0,asm,10002+20001,135.1593
gfx938,f8_w8a8_channel,torch.bfloat16,9,192,4096,192,8,0,0,asm,10008+20001,144.4141
gfx938,f8_w8a8_channel,torch.bfloat16,10,192,4096,192,8,0,0,asm,10011+20001,152.7594
gfx938,f8_w8a8_channel,torch.bfloat16,11,192,4096,192,8,0,0,asm,10002+20001,170.4348
gfx938,f8_w8a8_channel,torch.bfloat16,12,192,4096,192,8,0,0,asm,10002+20000,178.1488
gfx938,f8_w8a8_channel,torch.bfloat16,13,192,4096,192,8,0,0,asm,10002+20001,182.2583
gfx938,f8_w8a8_channel,torch.bfloat16,14,192,4096,192,8,0,0,asm,10002+20001,189.8372
gfx938,f8_w8a8_channel,torch.bfloat16,15,192,4096,192,8,0,0,asm,10008+20001,191.2014
gfx938,f8_w8a8_channel,torch.bfloat16,16,192,4096,192,8,0,0,asm,10008+20001,196.3971
gfx938,f8_w8a8_channel,torch.bfloat16,17,192,4096,192,8,0,0,asm,10011+20001,201.0932
gfx938,f8_w8a8_channel,torch.bfloat16,18,192,4096,192,8,0,0,asm,10011+20001,208.1921
gfx938,f8_w8a8_channel,torch.bfloat16,19,192,4096,192,8,0,0,asm,10011+20001,215.1816
gfx938,f8_w8a8_channel,torch.bfloat16,20,192,4096,192,8,0,0,asm,10008+20001,222.5584
gfx938,f8_w8a8_channel,torch.bfloat16,21,192,4096,192,8,0,0,asm,10009+20001,225.7163
gfx938,f8_w8a8_channel,torch.bfloat16,22,192,4096,192,8,0,0,asm,10008+20001,234.9288
gfx938,f8_w8a8_channel,torch.bfloat16,23,192,4096,192,8,0,0,asm,10002+20001,243.51
gfx938,f8_w8a8_channel,torch.bfloat16,24,192,4096,192,8,0,0,asm,10002+20001,252.2678
gfx938,f8_w8a8_channel,torch.bfloat16,25,192,4096,192,8,0,0,asm,10002+20001,261.2141
gfx938,f8_w8a8_channel,torch.bfloat16,26,192,4096,192,8,0,0,asm,10002+20001,264.0519
gfx938,f8_w8a8_channel,torch.bfloat16,27,192,4096,192,8,0,0,asm,10002+20001,275.4121
gfx938,f8_w8a8_channel,torch.bfloat16,28,192,4096,192,8,0,0,asm,10002+20001,276.5657
gfx938,f8_w8a8_channel,torch.bfloat16,29,192,4096,192,8,0,0,asm,10011+20001,276.0856
gfx938,f8_w8a8_channel,torch.bfloat16,30,192,4096,192,8,0,0,asm,10011+20001,280.5405
gfx938,f8_w8a8_channel,torch.bfloat16,31,192,4096,192,8,0,0,asm,10011+20001,282.5867
gfx938,f8_w8a8_channel,torch.bfloat16,32,192,4096,192,8,0,0,asm,10002+20001,297.3489
gfx938,f8_w8a8_channel,torch.bfloat16,34,192,4096,192,8,0,0,asm,10002+20001,291.4595
gfx938,f8_w8a8_channel,torch.bfloat16,36,192,4096,192,8,0,0,asm,10002+20001,293.5395
gfx938,f8_w8a8_channel,torch.bfloat16,40,192,4096,192,8,0,0,asm,10009+20001,303.7121
gfx938,f8_w8a8_channel,torch.bfloat16,50,192,4096,192,8,0,0,asm,10011+20001,321.9604
gfx938,f8_w8a8_channel,torch.bfloat16,60,192,4096,192,8,0,0,asm,10002+20001,334.0279
gfx938,f8_w8a8_channel,torch.bfloat16,64,192,4096,192,8,0,0,asm,10011+20001,336.2426
gfx938,f8_w8a8_channel,torch.bfloat16,68,192,4096,192,8,0,0,asm,10002+20001,345.1858
gfx938,f8_w8a8_channel,torch.bfloat16,72,192,4096,192,8,0,0,asm,10002+20001,348.0321
gfx938,f8_w8a8_channel,torch.bfloat16,80,192,4096,192,8,0,0,asm,10008+20001,347.3332
gfx938,f8_w8a8_channel,torch.bfloat16,88,192,4096,192,8,0,0,asm,10008+20001,349.8258
gfx938,f8_w8a8_channel,torch.bfloat16,96,192,4096,192,8,0,0,asm,10008+20001,357.59
gfx938,f8_w8a8_channel,torch.bfloat16,112,192,4096,192,8,0,0,asm,10008+20001,365.1857
gfx938,f8_w8a8_channel,torch.bfloat16,128,192,4096,192,8,0,0,asm,10002+20001,368.2005
gfx938,f8_w8a8_channel,torch.bfloat16,164,192,4096,192,8,0,0,asm,10002+20001,377.6742
gfx938,f8_w8a8_channel,torch.bfloat16,200,192,4096,192,8,0,0,asm,10002+20001,386.1121
gfx938,f8_w8a8_channel,torch.bfloat16,256,192,4096,192,8,0,0,asm,10002+20001,400.2932
gfx938,f8_w8a8_channel,torch.bfloat16,384,192,4096,192,8,0,0,asm,11004+21001,429.4721
gfx938,f8_w8a8_channel,torch.bfloat16,448,192,4096,192,8,0,0,asm,11004+21001,442.3479
gfx938,f8_w8a8_channel,torch.bfloat16,512,192,4096,192,8,0,0,asm,11004+21001,462.4826
gfx938,f8_w8a8_channel,torch.bfloat16,576,192,4096,192,8,0,0,asm,11004+21001,536.0068
gfx938,f8_w8a8_channel,torch.bfloat16,640,192,4096,192,8,0,0,asm,11004+21001,494.8532
gfx938,f8_w8a8_channel,torch.bfloat16,768,192,4096,192,8,0,0,asm,12000+22001,575.7121
gfx938,f8_w8a8_channel,torch.bfloat16,896,192,4096,192,8,0,0,asm,12000+22001,599.5942
gfx938,f8_w8a8_channel,torch.bfloat16,960,192,4096,192,8,0,0,asm,12000+22001,605.6405
gfx938,f8_w8a8_channel,torch.bfloat16,1024,192,4096,192,8,0,0,asm,12000+22001,613.4047
gfx938,f8_w8a8_channel,torch.bfloat16,1152,192,4096,192,8,0,0,asm,12000+22001,670.5163
gfx938,f8_w8a8_channel,torch.bfloat16,1280,192,4096,192,8,0,0,asm,12000+22001,682.4826
gfx938,f8_w8a8_channel,torch.bfloat16,1408,192,4096,192,8,0,0,asm,12000+22001,747.2995
gfx938,f8_w8a8_channel,torch.bfloat16,1536,192,4096,192,8,0,0,asm,12000+22001,844.7813
gfx938,f8_w8a8_channel,torch.bfloat16,1664,192,4096,192,8,0,0,asm,13000+23001,969.2786
gfx938,f8_w8a8_channel,torch.bfloat16,1920,192,4096,192,8,0,0,asm,13000+23001,1000.8655
gfx938,f8_w8a8_channel,torch.bfloat16,2048,192,4096,192,8,0,0,asm,13000+23001,1017.5138
gfx938,f8_w8a8_channel,torch.bfloat16,2304,192,4096,192,8,0,0,asm,13000+23001,1065.388
gfx938,f8_w8a8_channel,torch.bfloat16,2560,192,4096,192,8,0,0,asm,13001+23001,1106.7182
gfx938,f8_w8a8_channel,torch.bfloat16,2816,192,4096,192,8,0,0,asm,13001+23001,1242.9464
gfx938,f8_w8a8_channel,torch.bfloat16,3072,192,4096,192,8,0,0,asm,13001+23001,1430.5507
gfx938,f8_w8a8_channel,torch.bfloat16,3328,192,4096,192,8,0,0,asm,12000+22001,1591.2913
gfx938,f8_w8a8_channel,torch.bfloat16,3584,192,4096,192,8,0,0,asm,12000+22001,1686.8364
gfx938,f8_w8a8_channel,torch.bfloat16,3840,192,4096,192,8,0,0,asm,12000+22001,1746.9627
gfx938,f8_w8a8_channel,torch.bfloat16,4096,192,4096,192,8,0,0,asm,12000+22001,1789.9269
gfx938,f8_w8a8_channel,torch.bfloat16,4608,192,4096,192,8,0,0,asm,13000+23001,1990.3815
gfx938,f8_w8a8_channel,torch.bfloat16,5120,192,4096,192,8,0,0,asm,13001+23001,2063.0553
gfx938,f8_w8a8_channel,torch.bfloat16,5632,192,4096,192,8,0,0,asm,13001+23001,2188.8321
gfx938,f8_w8a8_channel,torch.bfloat16,6144,192,4096,192,8,0,0,asm,13001+23001,2473.7248
gfx938,f8_w8a8_channel,torch.bfloat16,6656,192,4096,192,8,0,0,asm,13001+23001,2799.1545
gfx938,f8_w8a8_channel,torch.bfloat16,7168,192,4096,192,8,0,0,asm,13001+23001,2912.2738
gfx938,f8_w8a8_channel,torch.bfloat16,7680,192,4096,192,8,0,0,asm,13001+23001,2974.8588
gfx938,f8_w8a8_channel,torch.bfloat16,8192,192,4096,192,8,0,0,asm,13001+23001,3091.6693
gfx938,f8_w8a8_channel,torch.bfloat16,10240,192,4096,192,8,0,0,asm,13001+23001,3955.2855
gfx938,f8_w8a8_channel,torch.bfloat16,12288,192,4096,192,8,0,0,asm,13001+23001,4556.5461
gfx938,f8_w8a8_channel,torch.bfloat16,14336,192,4096,192,8,0,0,asm,13001+23001,5127.8724
gfx938,f8_w8a8_channel,torch.bfloat16,16384,192,4096,192,8,0,0,asm,13001+23001,5982.643
gfx938,f8_w8a8_channel,torch.bfloat16,1,384,4096,192,8,0,0,asm,10008+20000,59.5804
gfx938,f8_w8a8_channel,torch.bfloat16,2,384,4096,192,8,0,0,asm,10011+20000,79.7909
gfx938,f8_w8a8_channel,torch.bfloat16,3,384,4096,192,8,0,0,asm,10011+20000,101.8709
gfx938,f8_w8a8_channel,torch.bfloat16,4,384,4096,192,8,0,0,asm,10002+20000,129.1467
gfx938,f8_w8a8_channel,torch.bfloat16,5,384,4096,192,8,0,0,asm,10011+20000,147.8331
gfx938,f8_w8a8_channel,torch.bfloat16,6,384,4096,192,8,0,0,asm,10011+20000,164.5404
gfx938,f8_w8a8_channel,torch.bfloat16,7,384,4096,192,8,0,0,asm,10002+20000,197.1467
gfx938,f8_w8a8_channel,torch.bfloat16,8,384,4096,192,8,0,0,asm,10008+20000,210.1236
gfx938,f8_w8a8_channel,torch.bfloat16,9,384,4096,192,8,0,0,asm,10012+20000,229.4667
gfx938,f8_w8a8_channel,torch.bfloat16,10,384,4096,192,8,0,0,asm,10011+20000,241.9297
gfx938,f8_w8a8_channel,torch.bfloat16,11,384,4096,192,8,0,0,asm,10002+20000,281.2393
gfx938,f8_w8a8_channel,torch.bfloat16,12,384,4096,192,8,0,0,asm,10002+20000,288.9109
gfx938,f8_w8a8_channel,torch.bfloat16,13,384,4096,192,8,0,0,asm,10002+20000,302.1067
gfx938,f8_w8a8_channel,torch.bfloat16,14,384,4096,192,8,0,0,asm,10011+20000,307.2604
gfx938,f8_w8a8_channel,torch.bfloat16,15,384,4096,192,8,0,0,asm,10011+20000,311.4204
gfx938,f8_w8a8_channel,torch.bfloat16,16,384,4096,192,8,0,0,asm,10011+20000,323.2856
gfx938,f8_w8a8_channel,torch.bfloat16,17,384,4096,192,8,0,0,asm,10002+20000,344.4867
gfx938,f8_w8a8_channel,torch.bfloat16,18,384,4096,192,8,0,0,asm,10002+20000,361.0931
gfx938,f8_w8a8_channel,torch.bfloat16,19,384,4096,192,8,0,0,asm,10011+20000,370.6004
gfx938,f8_w8a8_channel,torch.bfloat16,20,384,4096,192,8,0,0,asm,10011+20000,374.6257
gfx938,f8_w8a8_channel,torch.bfloat16,21,384,4096,192,8,0,0,asm,10011+20000,379.3905
gfx938,f8_w8a8_channel,torch.bfloat16,22,384,4096,192,8,0,0,asm,10011+20000,396.7562
gfx938,f8_w8a8_channel,torch.bfloat16,23,384,4096,192,8,0,0,asm,10002+20000,420.537
gfx938,f8_w8a8_channel,torch.bfloat16,24,384,4096,192,8,0,0,asm,10002+20000,433.1338
gfx938,f8_w8a8_channel,torch.bfloat16,25,384,4096,192,8,0,0,asm,10008+20000,447.5888
gfx938,f8_w8a8_channel,torch.bfloat16,26,384,4096,192,8,0,0,asm,10011+20000,449.3571
gfx938,f8_w8a8_channel,torch.bfloat16,27,384,4096,192,8,0,0,asm,10011+20000,458.3423
gfx938,f8_w8a8_channel,torch.bfloat16,28,384,4096,192,8,0,0,asm,10011+20000,459.9846
gfx938,f8_w8a8_channel,torch.bfloat16,29,384,4096,192,8,0,0,asm,10011+20000,459.3782
gfx938,f8_w8a8_channel,torch.bfloat16,30,384,4096,192,8,0,0,asm,10011+20000,474.2329
gfx938,f8_w8a8_channel,torch.bfloat16,31,384,4096,192,8,0,0,asm,10011+20000,479.2856
gfx938,f8_w8a8_channel,torch.bfloat16,32,384,4096,192,8,0,0,asm,10002+20000,505.69390000000004
gfx938,f8_w8a8_channel,torch.bfloat16,34,384,4096,192,8,0,0,asm,10008+20000,505.3288
gfx938,f8_w8a8_channel,torch.bfloat16,36,384,4096,192,8,0,0,asm,10011+20000,512.3015
gfx938,f8_w8a8_channel,torch.bfloat16,40,384,4096,192,8,0,0,asm,10011+20000,520.4194
gfx938,f8_w8a8_channel,torch.bfloat16,50,384,4096,192,8,0,0,asm,10002+20000,572.9583
gfx938,f8_w8a8_channel,torch.bfloat16,60,384,4096,192,8,0,0,asm,10011+20000,585.2783
gfx938,f8_w8a8_channel,torch.bfloat16,64,384,4096,192,8,0,0,asm,10011+20000,586.9457
gfx938,f8_w8a8_channel,torch.bfloat16,68,384,4096,192,8,0,0,asm,10011+20000,588.4109
gfx938,f8_w8a8_channel,torch.bfloat16,72,384,4096,192,8,0,0,asm,10011+20000,596.2594
gfx938,f8_w8a8_channel,torch.bfloat16,80,384,4096,192,8,0,0,asm,10011+20000,604.5288
gfx938,f8_w8a8_channel,torch.bfloat16,88,384,4096,192,8,0,0,asm,10011+20000,600.8912
gfx938,f8_w8a8_channel,torch.bfloat16,96,384,4096,192,8,0,0,asm,10011+20000,601.0764
gfx938,f8_w8a8_channel,torch.bfloat16,112,384,4096,192,8,0,0,asm,10011+20000,617.6238
gfx938,f8_w8a8_channel,torch.bfloat16,128,384,4096,192,8,0,0,asm,10013+20000,624.6806
gfx938,f8_w8a8_channel,torch.bfloat16,164,384,4096,192,8,0,0,asm,11007+21000,635.552
gfx938,f8_w8a8_channel,torch.bfloat16,200,384,4096,192,8,0,0,asm,11005+21000,639.9392
gfx938,f8_w8a8_channel,torch.bfloat16,256,384,4096,192,8,0,0,asm,11007+21000,622.8442
gfx938,f8_w8a8_channel,torch.bfloat16,384,384,4096,192,8,0,0,asm,11005+21000,648.8489
gfx938,f8_w8a8_channel,torch.bfloat16,448,384,4096,192,8,0,0,asm,11005+21000,669.3034
gfx938,f8_w8a8_channel,torch.bfloat16,512,384,4096,192,8,0,0,asm,11005+21000,705.5139
gfx938,f8_w8a8_channel,torch.bfloat16,576,384,4096,192,8,0,0,asm,11004+21000,752.3603
gfx938,f8_w8a8_channel,torch.bfloat16,640,384,4096,192,8,0,0,asm,11005+21000,735.7456
gfx938,f8_w8a8_channel,torch.bfloat16,768,384,4096,192,8,0,0,asm,12005+22001,785.4973
gfx938,f8_w8a8_channel,torch.bfloat16,896,384,4096,192,8,0,0,asm,12005+22001,808.0238
gfx938,f8_w8a8_channel,torch.bfloat16,960,384,4096,192,8,0,0,asm,12005+22000,850.3143
gfx938,f8_w8a8_channel,torch.bfloat16,1024,384,4096,192,8,0,0,asm,12005+22001,824.63
gfx938,f8_w8a8_channel,torch.bfloat16,1152,384,4096,192,8,0,0,asm,12005+22001,920.6298
gfx938,f8_w8a8_channel,torch.bfloat16,1280,384,4096,192,8,0,0,asm,12001+22001,900.1413
gfx938,f8_w8a8_channel,torch.bfloat16,1408,384,4096,192,8,0,0,asm,12003+22000,1022.5834
gfx938,f8_w8a8_channel,torch.bfloat16,1536,384,4096,192,8,0,0,asm,12005+22001,1157.3204
gfx938,f8_w8a8_channel,torch.bfloat16,1664,384,4096,192,8,0,0,asm,12001+22001,1223.6704
gfx938,f8_w8a8_channel,torch.bfloat16,1920,384,4096,192,8,0,0,asm,13001+22000,1410.853
gfx938,f8_w8a8_channel,torch.bfloat16,2048,384,4096,192,8,0,0,asm,12002+22000,1616.5035
gfx938,f8_w8a8_channel,torch.bfloat16,2304,384,4096,192,8,0,0,asm,13001+23001,1234.1287
gfx938,f8_w8a8_channel,torch.bfloat16,2560,384,4096,192,8,0,0,asm,13001+23001,1321.0339
gfx938,f8_w8a8_channel,torch.bfloat16,2816,384,4096,192,8,0,0,asm,13001+23001,1391.2062
gfx938,f8_w8a8_channel,torch.bfloat16,3072,384,4096,192,8,0,0,asm,13001+23001,1619.9061
gfx938,f8_w8a8_channel,torch.bfloat16,3328,384,4096,192,8,0,0,asm,12001+22001,1901.8681
gfx938,f8_w8a8_channel,torch.bfloat16,3584,384,4096,192,8,0,0,asm,12001+22001,2119.3495
gfx938,f8_w8a8_channel,torch.bfloat16,3840,384,4096,192,8,0,0,asm,12001+22001,2185.8674
gfx938,f8_w8a8_channel,torch.bfloat16,4096,384,4096,192,8,0,0,asm,13001+23001,2237.3788
gfx938,f8_w8a8_channel,torch.bfloat16,4608,384,4096,192,8,0,0,asm,13001+23001,2310.9957
gfx938,f8_w8a8_channel,torch.bfloat16,5120,384,4096,192,8,0,0,asm,13001+23001,2378.0357
gfx938,f8_w8a8_channel,torch.bfloat16,5632,384,4096,192,8,0,0,asm,13001+23001,2539.4922
gfx938,f8_w8a8_channel,torch.bfloat16,6144,384,4096,192,8,0,0,asm,13001+23001,2885.2772
gfx938,f8_w8a8_channel,torch.bfloat16,6656,384,4096,192,8,0,0,asm,13001+23001,3287.535
gfx938,f8_w8a8_channel,torch.bfloat16,7168,384,4096,192,8,0,0,asm,12001+21001,4397.0517
gfx938,f8_w8a8_channel,torch.bfloat16,7680,384,4096,192,8,0,0,asm,13001+23001,3472.5801
gfx938,f8_w8a8_channel,torch.bfloat16,8192,384,4096,192,8,0,0,asm,12001+22001,4283.1629
gfx938,f8_w8a8_channel,torch.bfloat16,10240,384,4096,192,8,0,0,asm,12003+22000,5482.4985
gfx938,f8_w8a8_channel,torch.bfloat16,12288,384,4096,192,8,0,0,asm,13001+23001,5284.9398
gfx938,f8_w8a8_channel,torch.bfloat16,14336,384,4096,192,8,0,0,asm,13001+23001,5947.1976
gfx938,f8_w8a8_channel,torch.bfloat16,16384,384,4096,192,8,0,0,asm,13001+23001,6931.904
gfx938,f8_w8a8_channel,torch.bfloat16,1,320,6144,160,8,0,0,asm,10008+20000,69.6014
gfx938,f8_w8a8_channel,torch.bfloat16,2,320,6144,160,8,0,0,asm,10011+20000,101.4667
gfx938,f8_w8a8_channel,torch.bfloat16,3,320,6144,160,8,0,0,asm,10011+20001,121.433
gfx938,f8_w8a8_channel,torch.bfloat16,4,320,6144,160,8,0,0,asm,10002+20000,156.473
gfx938,f8_w8a8_channel,torch.bfloat16,5,320,6144,160,8,0,0,asm,10008+20000,176.153
gfx938,f8_w8a8_channel,torch.bfloat16,6,320,6144,160,8,0,0,asm,10012+20000,193.4667
gfx938,f8_w8a8_channel,torch.bfloat16,7,320,6144,160,8,0,0,asm,10011+20000,211.4052
gfx938,f8_w8a8_channel,torch.bfloat16,8,320,6144,160,8,0,0,asm,10002+20000,248.4655
gfx938,f8_w8a8_channel,torch.bfloat16,9,320,6144,160,8,0,0,asm,10002+20000,280.9531
gfx938,f8_w8a8_channel,torch.bfloat16,10,320,6144,160,8,0,0,asm,10011+20000,298.0814
gfx938,f8_w8a8_channel,torch.bfloat16,11,320,6144,160,8,0,0,asm,10002+20000,334.0654
gfx938,f8_w8a8_channel,torch.bfloat16,12,320,6144,160,8,0,0,asm,10008+20000,352.5205
gfx938,f8_w8a8_channel,torch.bfloat16,13,320,6144,160,8,0,0,asm,10008+20000,364.6246
gfx938,f8_w8a8_channel,torch.bfloat16,14,320,6144,160,8,0,0,asm,10011+20000,378.7972
gfx938,f8_w8a8_channel,torch.bfloat16,15,320,6144,160,8,0,0,asm,10011+20000,390.9404
gfx938,f8_w8a8_channel,torch.bfloat16,16,320,6144,160,8,0,0,asm,10008+20000,406.3341
gfx938,f8_w8a8_channel,torch.bfloat16,17,320,6144,160,8,0,0,asm,10002+20000,410.4993
gfx938,f8_w8a8_channel,torch.bfloat16,18,320,6144,160,8,0,0,asm,10002+20000,432.5962
gfx938,f8_w8a8_channel,torch.bfloat16,19,320,6144,160,8,0,0,asm,10002+20000,446.8446
gfx938,f8_w8a8_channel,torch.bfloat16,20,320,6144,160,8,0,0,asm,10002+20000,458.9288
gfx938,f8_w8a8_channel,torch.bfloat16,21,320,6144,160,8,0,0,asm,10012+20000,468.0993
gfx938,f8_w8a8_channel,torch.bfloat16,22,320,6144,160,8,0,0,asm,10012+20000,473.7583
gfx938,f8_w8a8_channel,torch.bfloat16,23,320,6144,160,8,0,0,asm,10011+20000,482.0193
gfx938,f8_w8a8_channel,torch.bfloat16,24,320,6144,160,8,0,0,asm,10011+20000,488.0067
gfx938,f8_w8a8_channel,torch.bfloat16,25,320,6144,160,8,0,0,asm,10011+20000,495.5972
gfx938,f8_w8a8_channel,torch.bfloat16,26,320,6144,160,8,0,0,asm,10012+20000,494.3508
gfx938,f8_w8a8_channel,torch.bfloat16,27,320,6144,160,8,0,0,asm,10002+20000,514.0897
gfx938,f8_w8a8_channel,torch.bfloat16,28,320,6144,160,8,0,0,asm,10002+20000,512.8855
gfx938,f8_w8a8_channel,torch.bfloat16,29,320,6144,160,8,0,0,asm,10002+20000,521.6281
gfx938,f8_w8a8_channel,torch.bfloat16,30,320,6144,160,8,0,0,asm,10002+20000,528.0692
gfx938,f8_w8a8_channel,torch.bfloat16,31,320,6144,160,8,0,0,asm,10002+20000,545.9403
gfx938,f8_w8a8_channel,torch.bfloat16,32,320,6144,160,8,0,0,asm,10002+20000,536.8097
gfx938,f8_w8a8_channel,torch.bfloat16,34,320,6144,160,8,0,0,asm,10002+20000,553.843
gfx938,f8_w8a8_channel,torch.bfloat16,36,320,6144,160,8,0,0,asm,10011+20000,561.0852
gfx938,f8_w8a8_channel,torch.bfloat16,40,320,6144,160,8,0,0,asm,10002+20000,576.1735
gfx938,f8_w8a8_channel,torch.bfloat16,50,320,6144,160,8,0,0,asm,10002+20000,596.8992
gfx938,f8_w8a8_channel,torch.bfloat16,60,320,6144,160,8,0,0,asm,10002+20000,611.6024
gfx938,f8_w8a8_channel,torch.bfloat16,64,320,6144,160,8,0,0,asm,10002+20000,608.2677
gfx938,f8_w8a8_channel,torch.bfloat16,68,320,6144,160,8,0,0,asm,10002+20000,609.0344
gfx938,f8_w8a8_channel,torch.bfloat16,72,320,6144,160,8,0,0,asm,10002+20000,614.6851
gfx938,f8_w8a8_channel,torch.bfloat16,80,320,6144,160,8,0,0,asm,10002+20000,619.4088
gfx938,f8_w8a8_channel,torch.bfloat16,88,320,6144,160,8,0,0,asm,10002+20000,615.9141
gfx938,f8_w8a8_channel,torch.bfloat16,96,320,6144,160,8,0,0,asm,10002+20000,628.9667
gfx938,f8_w8a8_channel,torch.bfloat16,112,320,6144,160,8,0,0,asm,10002+20000,646.8109
gfx938,f8_w8a8_channel,torch.bfloat16,128,320,6144,160,8,0,0,asm,10002+20000,659.7204
gfx938,f8_w8a8_channel,torch.bfloat16,164,320,6144,160,8,0,0,asm,10002+20000,658.2551
gfx938,f8_w8a8_channel,torch.bfloat16,200,320,6144,160,8,0,0,asm,10002+20000,680.8404
gfx938,f8_w8a8_channel,torch.bfloat16,256,320,6144,160,8,0,0,asm,11005+21000,703.8734
gfx938,f8_w8a8_channel,torch.bfloat16,384,320,6144,160,8,0,0,asm,11005+21000,741.0434
gfx938,f8_w8a8_channel,torch.bfloat16,448,320,6144,160,8,0,0,asm,11005+21000,756.2762
gfx938,f8_w8a8_channel,torch.bfloat16,512,320,6144,160,8,0,0,asm,11005+21000,785.7922
gfx938,f8_w8a8_channel,torch.bfloat16,576,320,6144,160,8,0,0,asm,12001+22001,887.2916
gfx938,f8_w8a8_channel,torch.bfloat16,640,320,6144,160,8,0,0,asm,12003+22001,897.8008
gfx938,f8_w8a8_channel,torch.bfloat16,768,320,6144,160,8,0,0,asm,12001+22001,880.7285
gfx938,f8_w8a8_channel,torch.bfloat16,896,320,6144,160,8,0,0,asm,12001+22001,923.4593
gfx938,f8_w8a8_channel,torch.bfloat16,960,320,6144,160,8,0,0,asm,12001+22001,941.1182
gfx938,f8_w8a8_channel,torch.bfloat16,1024,320,6144,160,8,0,0,asm,12001+22001,972.5456
gfx938,f8_w8a8_channel,torch.bfloat16,1152,320,6144,160,8,0,0,asm,12001+22001,1065.0424
gfx938,f8_w8a8_channel,torch.bfloat16,1280,320,6144,160,8,0,0,asm,12003+22001,1285.0508
gfx938,f8_w8a8_channel,torch.bfloat16,1408,320,6144,160,8,0,0,asm,13001+23001,1361.4546
gfx938,f8_w8a8_channel,torch.bfloat16,1536,320,6144,160,8,0,0,asm,13001+23001,1402.4149
gfx938,f8_w8a8_channel,torch.bfloat16,1664,320,6144,160,8,0,0,asm,13001+23001,1414.0279
gfx938,f8_w8a8_channel,torch.bfloat16,1920,320,6144,160,8,0,0,asm,13001+23001,1460.9076
gfx938,f8_w8a8_channel,torch.bfloat16,2048,320,6144,160,8,0,0,asm,13001+23001,1491.1982
gfx938,f8_w8a8_channel,torch.bfloat16,2304,320,6144,160,8,0,0,asm,13001+23001,1656.7476
gfx938,f8_w8a8_channel,torch.bfloat16,2560,320,6144,160,8,0,0,asm,13001+23001,2013.8255
gfx938,f8_w8a8_channel,torch.bfloat16,2816,320,6144,160,8,0,0,asm,12001+22001,2361.5574
gfx938,f8_w8a8_channel,torch.bfloat16,3072,320,6144,160,8,0,0,asm,12001+22001,2514.7185
gfx938,f8_w8a8_channel,torch.bfloat16,3328,320,6144,160,8,0,0,asm,12001+22001,2599.3669
gfx938,f8_w8a8_channel,torch.bfloat16,3584,320,6144,160,8,0,0,asm,12001+22001,2712.2848
gfx938,f8_w8a8_channel,torch.bfloat16,3840,320,6144,160,8,0,0,asm,13001+23001,2762.1038
gfx938,f8_w8a8_channel,torch.bfloat16,4096,320,6144,160,8,0,0,asm,13001+23001,2799.1474
gfx938,f8_w8a8_channel,torch.bfloat16,4608,320,6144,160,8,0,0,asm,13001+23001,2929.9516
gfx938,f8_w8a8_channel,torch.bfloat16,5120,320,6144,160,8,0,0,asm,13001+23001,3464.4512
gfx938,f8_w8a8_channel,torch.bfloat16,5632,320,6144,160,8,0,0,asm,13001+23001,3967.3241
gfx938,f8_w8a8_channel,torch.bfloat16,6144,320,6144,160,8,0,0,asm,13001+23001,4164.1323
gfx938,f8_w8a8_channel,torch.bfloat16,6656,320,6144,160,8,0,0,asm,13001+23001,4277.5134
gfx938,f8_w8a8_channel,torch.bfloat16,7168,320,6144,160,8,0,0,asm,13001+23001,4424.5111
gfx938,f8_w8a8_channel,torch.bfloat16,7680,320,6144,160,8,0,0,asm,13001+23001,4918.0516
gfx938,f8_w8a8_channel,torch.bfloat16,8192,320,6144,160,8,0,0,asm,13001+23001,5357.5801
gfx938,f8_w8a8_channel,torch.bfloat16,10240,320,6144,160,8,0,0,asm,13001+23001,6304.1441
gfx938,f8_w8a8_channel,torch.bfloat16,12288,320,6144,160,8,0,0,asm,13001+23001,7287.5066
gfx938,f8_w8a8_channel,torch.bfloat16,14336,320,6144,160,8,0,0,asm,13001+23001,8555.5617
gfx938,f8_w8a8_channel,torch.bfloat16,16384,320,6144,160,8,0,0,asm,13001+23001,9789.7686
gfx938,f8_w8a8_channel,torch.bfloat16,1,512,4096,256,8,0,0,asm,10010+20000,75.8919
gfx938,f8_w8a8_channel,torch.bfloat16,2,512,4096,256,8,0,0,asm,10011+20000,110.873
gfx938,f8_w8a8_channel,torch.bfloat16,3,512,4096,256,8,0,0,asm,10011+20000,156.1189
gfx938,f8_w8a8_channel,torch.bfloat16,4,512,4096,256,8,0,0,asm,10013+20000,187.1673
gfx938,f8_w8a8_channel,torch.bfloat16,5,512,4096,256,8,0,0,asm,10011+20000,234.5187
gfx938,f8_w8a8_channel,torch.bfloat16,6,512,4096,256,8,0,0,asm,10013+20000,263.8997
gfx938,f8_w8a8_channel,torch.bfloat16,7,512,4096,256,8,0,0,asm,10011+20000,314.1481
gfx938,f8_w8a8_channel,torch.bfloat16,8,512,4096,256,8,0,0,asm,10011+20000,328.3713
gfx938,f8_w8a8_channel,torch.bfloat16,9,512,4096,256,8,0,0,asm,10013+20000,356.5142
gfx938,f8_w8a8_channel,torch.bfloat16,10,512,4096,256,8,0,0,asm,10002+20000,401.9035
gfx938,f8_w8a8_channel,torch.bfloat16,11,512,4096,256,8,0,0,asm,10013+20000,424.792
gfx938,f8_w8a8_channel,torch.bfloat16,12,512,4096,256,8,0,0,asm,10013+20000,437.2972
gfx938,f8_w8a8_channel,torch.bfloat16,13,512,4096,256,8,0,0,asm,10002+20000,477.3813
gfx938,f8_w8a8_channel,torch.bfloat16,14,512,4096,256,8,0,0,asm,10013+20000,504.0507
gfx938,f8_w8a8_channel,torch.bfloat16,15,512,4096,256,8,0,0,asm,10013+20000,519.0485
gfx938,f8_w8a8_channel,torch.bfloat16,16,512,4096,256,8,0,0,asm,10002+20000,545.061
gfx938,f8_w8a8_channel,torch.bfloat16,17,512,4096,256,8,0,0,asm,11007+20000,559.3569
gfx938,f8_w8a8_channel,torch.bfloat16,18,512,4096,256,8,0,0,asm,11005+20000,574.4559
gfx938,f8_w8a8_channel,torch.bfloat16,20,512,4096,256,8,0,0,asm,10002+20000,634.363
gfx938,f8_w8a8_channel,torch.bfloat16,24,512,4096,256,8,0,0,asm,11007+20000,722.2786
gfx938,f8_w8a8_channel,torch.bfloat16,28,512,4096,256,8,0,0,asm,10013+20000,819.4688
gfx938,f8_w8a8_channel,torch.bfloat16,32,512,4096,256,8,0,0,asm,10013+20000,883.6372
gfx938,f8_w8a8_channel,torch.bfloat16,34,512,4096,256,8,0,0,asm,10013+20000,885.7056
gfx938,f8_w8a8_channel,torch.bfloat16,36,512,4096,256,8,0,0,asm,11004+20000,925.9328
gfx938,f8_w8a8_channel,torch.bfloat16,40,512,4096,256,8,0,0,asm,10013+20000,959.1117
gfx938,f8_w8a8_channel,torch.bfloat16,44,512,4096,256,8,0,0,asm,10013+20000,1014.1347
gfx938,f8_w8a8_channel,torch.bfloat16,48,512,4096,256,8,0,0,asm,10013+20000,1031.8273
gfx938,f8_w8a8_channel,torch.bfloat16,56,512,4096,256,8,0,0,asm,10013+20000,1095.1198
gfx938,f8_w8a8_channel,torch.bfloat16,64,512,4096,256,8,0,0,asm,10013+20000,1125.0987
gfx938,f8_w8a8_channel,torch.bfloat16,68,512,4096,256,8,0,0,asm,10013+20000,1132.2397
gfx938,f8_w8a8_channel,torch.bfloat16,72,512,4096,256,8,0,0,asm,10013+20000,1199.5153
gfx938,f8_w8a8_channel,torch.bfloat16,80,512,4096,256,8,0,0,asm,10013+20000,1204.4331
gfx938,f8_w8a8_channel,torch.bfloat16,88,512,4096,256,8,0,0,asm,10013+20000,1230.0583
gfx938,f8_w8a8_channel,torch.bfloat16,96,512,4096,256,8,0,0,asm,10013+20000,1262.8582
gfx938,f8_w8a8_channel,torch.bfloat16,104,512,4096,256,8,0,0,asm,10013+20000,1270.4204
gfx938,f8_w8a8_channel,torch.bfloat16,112,512,4096,256,8,0,0,asm,10013+20000,1282.7065
gfx938,f8_w8a8_channel,torch.bfloat16,128,512,4096,256,8,0,0,asm,10013+20000,1305.9148
gfx938,f8_w8a8_channel,torch.bfloat16,144,512,4096,256,8,0,0,asm,10013+20000,1327.5401
gfx938,f8_w8a8_channel,torch.bfloat16,160,512,4096,256,8,0,0,asm,10013+20000,1372.9126
gfx938,f8_w8a8_channel,torch.bfloat16,192,512,4096,256,8,0,0,asm,10013+20000,1337.5275
gfx938,f8_w8a8_channel,torch.bfloat16,224,512,4096,256,8,0,0,asm,10013+20000,1346.6138
gfx938,f8_w8a8_channel,torch.bfloat16,256,512,4096,256,8,0,0,asm,10013+20000,1360.3402
gfx938,f8_w8a8_channel,torch.bfloat16,320,512,4096,256,8,0,0,asm,12005+22000,1397.5359
gfx938,f8_w8a8_channel,torch.bfloat16,384,512,4096,256,8,0,0,asm,10012+20000,1409.5781
gfx938,f8_w8a8_channel,torch.bfloat16,448,512,4096,256,8,0,0,asm,12005+22000,1431.1863
gfx938,f8_w8a8_channel,torch.bfloat16,512,512,4096,256,8,0,0,asm,12005+22000,1453.0052
gfx938,f8_w8a8_channel,torch.bfloat16,576,512,4096,256,8,0,0,asm,11004+21000,1552.9712
gfx938,f8_w8a8_channel,torch.bfloat16,640,512,4096,256,8,0,0,asm,12005+22000,1488.6682
gfx938,f8_w8a8_channel,torch.bfloat16,704,512,4096,256,8,0,0,asm,12001+22000,1506.5207
gfx938,f8_w8a8_channel,torch.bfloat16,768,512,4096,256,8,0,0,asm,12005+22000,1524.1374
gfx938,f8_w8a8_channel,torch.bfloat16,832,512,4096,256,8,0,0,asm,12001+22000,1553.2487
gfx938,f8_w8a8_channel,torch.bfloat16,896,512,4096,256,8,0,0,asm,12005+22000,1573.4087
gfx938,f8_w8a8_channel,torch.bfloat16,960,512,4096,256,8,0,0,asm,12001+22000,1577.7371
gfx938,f8_w8a8_channel,torch.bfloat16,1024,512,4096,256,8,0,0,asm,12001+22000,1596.3645
gfx938,f8_w8a8_channel,torch.bfloat16,1152,512,4096,256,8,0,0,asm,12005+22000,1656.3139
gfx938,f8_w8a8_channel,torch.bfloat16,1280,512,4096,256,8,0,0,asm,12001+22000,1659.0086
gfx938,f8_w8a8_channel,torch.bfloat16,1408,512,4096,256,8,0,0,asm,12001+22000,1694.2758
gfx938,f8_w8a8_channel,torch.bfloat16,1536,512,4096,256,8,0,0,asm,12001+22000,1747.2695
gfx938,f8_w8a8_channel,torch.bfloat16,1664,512,4096,256,8,0,0,asm,12001+22000,1764.5495
gfx938,f8_w8a8_channel,torch.bfloat16,1792,512,4096,256,8,0,0,asm,12001+22000,1814.0988
gfx938,f8_w8a8_channel,torch.bfloat16,1920,512,4096,256,8,0,0,asm,12001+22000,1863.4713
gfx938,f8_w8a8_channel,torch.bfloat16,2048,512,4096,256,8,0,0,asm,12001+22000,2004.3972
gfx938,f8_w8a8_channel,torch.bfloat16,2304,512,4096,256,8,0,0,asm,13001+23001,2068.1275
gfx938,f8_w8a8_channel,torch.bfloat16,2560,512,4096,256,8,0,0,asm,13001+23001,2104.7673
gfx938,f8_w8a8_channel,torch.bfloat16,2816,512,4096,256,8,0,0,asm,13001+23001,2164.767
gfx938,f8_w8a8_channel,torch.bfloat16,3072,512,4096,256,8,0,0,asm,13001+23001,2232.9101
gfx938,f8_w8a8_channel,torch.bfloat16,3328,512,4096,256,8,0,0,asm,13001+23001,2291.0403
gfx938,f8_w8a8_channel,torch.bfloat16,3584,512,4096,256,8,0,0,asm,13001+23001,2342.3748
gfx938,f8_w8a8_channel,torch.bfloat16,3840,512,4096,256,8,0,0,asm,13001+23001,2525.4564
gfx938,f8_w8a8_channel,torch.bfloat16,4096,512,4096,256,8,0,0,asm,13001+23001,2913.6658
gfx938,f8_w8a8_channel,torch.bfloat16,4608,512,4096,256,8,0,0,asm,12001+22001,3532.8052
gfx938,f8_w8a8_channel,torch.bfloat16,5120,512,4096,256,8,0,0,asm,12001+22001,3707.7436
gfx938,f8_w8a8_channel,torch.bfloat16,5632,512,4096,256,8,0,0,asm,12001+22001,3903.2379
gfx938,f8_w8a8_channel,torch.bfloat16,6144,512,4096,256,8,0,0,asm,13001+23001,4042.7495
gfx938,f8_w8a8_channel,torch.bfloat16,6656,512,4096,256,8,0,0,asm,13001+23001,4125.4021
gfx938,f8_w8a8_channel,torch.bfloat16,7168,512,4096,256,8,0,0,asm,13001+23001,4192.7535
gfx938,f8_w8a8_channel,torch.bfloat16,7680,512,4096,256,8,0,0,asm,13001+23001,4438.7736
gfx938,f8_w8a8_channel,torch.bfloat16,8192,512,4096,256,8,0,0,asm,13001+23001,4982.7885
gfx938,f8_w8a8_channel,torch.bfloat16,10240,512,4096,256,8,0,0,asm,13001+23001,5955.813
gfx938,f8_w8a8_channel,torch.bfloat16,12288,512,4096,256,8,0,0,asm,13001+23001,6843.4231
gfx938,f8_w8a8_channel,torch.bfloat16,14336,512,4096,256,8,0,0,asm,13001+23001,8050.5179
gfx938,f8_w8a8_channel,torch.bfloat16,16384,512,4096,256,8,0,0,asm,13001+23001,8920.3837
gfx938,f8_w8a8_channel,torch.bfloat16,17408,512,4096,256,8,0,0,asm,13001+23001,9615.6993
gfx938,f8_w8a8_channel,torch.bfloat16,24576,512,4096,256,8,0,0,asm,13001+23001,12861.3819
gfx938,f8_w8a8_channel,torch.bfloat16,32768,512,4096,256,8,0,0,asm,13001+23001,16857.1581
gfx938,f8_w8a8_channel,torch.bfloat16,40960,512,4096,256,8,0,0,asm,13001+23001,20774.7386
gfx938,f8_w8a8_channel,torch.bfloat16,49152,512,4096,256,8,0,0,asm,13001+23001,24836.1122
gfx938,f8_w8a8_channel,torch.bfloat16,57344,512,4096,256,8,0,0,asm,13001+23001,28813.0985
gfx938,f8_w8a8_channel,torch.bfloat16,65536,512,4096,256,8,0,0,asm,13001+23001,32831.1841
gfx938,f8_w8a8_channel,torch.bfloat16,1,256,4096,256,8,0,0,asm,10002+20000,56.3044
gfx938,f8_w8a8_channel,torch.bfloat16,2,256,4096,256,8,0,0,asm,10009+20000,77.8791
gfx938,f8_w8a8_channel,torch.bfloat16,3,256,4096,256,8,0,0,asm,10011+20000,95.4364
gfx938,f8_w8a8_channel,torch.bfloat16,4,256,4096,256,8,0,0,asm,10013+20000,114.7963
gfx938,f8_w8a8_channel,torch.bfloat16,5,256,4096,256,8,0,0,asm,10002+20001,141.6684
gfx938,f8_w8a8_channel,torch.bfloat16,6,256,4096,256,8,0,0,asm,10011+20000,157.0874
gfx938,f8_w8a8_channel,torch.bfloat16,7,256,4096,256,8,0,0,asm,10011+20000,166.4009
gfx938,f8_w8a8_channel,torch.bfloat16,8,256,4096,256,8,0,0,asm,10011+20000,181.5504
gfx938,f8_w8a8_channel,torch.bfloat16,9,256,4096,256,8,0,0,asm,10011+20000,195.1926
gfx938,f8_w8a8_channel,torch.bfloat16,10,256,4096,256,8,0,0,asm,10002+20000,227.9334
gfx938,f8_w8a8_channel,torch.bfloat16,11,256,4096,256,8,0,0,asm,10011+20000,237.0449
gfx938,f8_w8a8_channel,torch.bfloat16,12,256,4096,256,8,0,0,asm,10011+20000,243.7987
gfx938,f8_w8a8_channel,torch.bfloat16,13,256,4096,256,8,0,0,asm,10012+20000,254.5859
gfx938,f8_w8a8_channel,torch.bfloat16,14,256,4096,256,8,0,0,asm,10011+20000,268.8848
gfx938,f8_w8a8_channel,torch.bfloat16,15,256,4096,256,8,0,0,asm,10013+20000,274.527
gfx938,f8_w8a8_channel,torch.bfloat16,16,256,4096,256,8,0,0,asm,10002+20000,306.2069
gfx938,f8_w8a8_channel,torch.bfloat16,17,256,4096,256,8,0,0,asm,10011+20000,306.8776
gfx938,f8_w8a8_channel,torch.bfloat16,18,256,4096,256,8,0,0,asm,10011+20000,313.0418
gfx938,f8_w8a8_channel,torch.bfloat16,20,256,4096,256,8,0,0,asm,10011+20000,326.0776
gfx938,f8_w8a8_channel,torch.bfloat16,24,256,4096,256,8,0,0,asm,10011+20000,384.5867
gfx938,f8_w8a8_channel,torch.bfloat16,28,256,4096,256,8,0,0,asm,10013+20000,428.9265
gfx938,f8_w8a8_channel,torch.bfloat16,32,256,4096,256,8,0,0,asm,10011+20000,476.4969
gfx938,f8_w8a8_channel,torch.bfloat16,34,256,4096,256,8,0,0,asm,10013+20000,468.9065
gfx938,f8_w8a8_channel,torch.bfloat16,36,256,4096,256,8,0,0,asm,10011+20000,474.2539
gfx938,f8_w8a8_channel,torch.bfloat16,40,256,4096,256,8,0,0,asm,10013+20000,487.9128
gfx938,f8_w8a8_channel,torch.bfloat16,44,256,4096,256,8,0,0,asm,10013+20000,531.6433
gfx938,f8_w8a8_channel,torch.bfloat16,48,256,4096,256,8,0,0,asm,10013+20000,538.1696
gfx938,f8_w8a8_channel,torch.bfloat16,56,256,4096,256,8,0,0,asm,10013+20000,550.0937
gfx938,f8_w8a8_channel,torch.bfloat16,64,256,4096,256,8,0,0,asm,10013+20000,564.0726
gfx938,f8_w8a8_channel,torch.bfloat16,68,256,4096,256,8,0,0,asm,10013+20000,570.9105
gfx938,f8_w8a8_channel,torch.bfloat16,72,256,4096,256,8,0,0,asm,10002+20000,638.5988
gfx938,f8_w8a8_channel,torch.bfloat16,80,256,4096,256,8,0,0,asm,10011+20000,623.4157
gfx938,f8_w8a8_channel,torch.bfloat16,88,256,4096,256,8,0,0,asm,10013+20000,628.4768
gfx938,f8_w8a8_channel,torch.bfloat16,96,256,4096,256,8,0,0,asm,10013+20000,633.9588
gfx938,f8_w8a8_channel,torch.bfloat16,104,256,4096,256,8,0,0,asm,10011+20000,640.2747
gfx938,f8_w8a8_channel,torch.bfloat16,112,256,4096,256,8,0,0,asm,10013+20000,647.8451
gfx938,f8_w8a8_channel,torch.bfloat16,128,256,4096,256,8,0,0,asm,10013+20000,660.0304
gfx938,f8_w8a8_channel,torch.bfloat16,144,256,4096,256,8,0,0,asm,11007+21000,667.3651
gfx938,f8_w8a8_channel,torch.bfloat16,160,256,4096,256,8,0,0,asm,10013+20000,720.207
gfx938,f8_w8a8_channel,torch.bfloat16,192,256,4096,256,8,0,0,asm,11007+21000,684.8472
gfx938,f8_w8a8_channel,torch.bfloat16,224,256,4096,256,8,0,0,asm,11007+21000,692.9482
gfx938,f8_w8a8_channel,torch.bfloat16,256,256,4096,256,8,0,0,asm,11007+21000,701.0576
gfx938,f8_w8a8_channel,torch.bfloat16,320,256,4096,256,8,0,0,asm,11007+21000,714.9185
gfx938,f8_w8a8_channel,torch.bfloat16,384,256,4096,256,8,0,0,asm,11007+21000,730.2954
gfx938,f8_w8a8_channel,torch.bfloat16,448,256,4096,256,8,0,0,asm,11007+21000,749.7058
gfx938,f8_w8a8_channel,torch.bfloat16,512,256,4096,256,8,0,0,asm,11007+21000,767.71
gfx938,f8_w8a8_channel,torch.bfloat16,576,256,4096,256,8,0,0,asm,11007+21000,911.0612
gfx938,f8_w8a8_channel,torch.bfloat16,640,256,4096,256,8,0,0,asm,11007+21000,827.5752
gfx938,f8_w8a8_channel,torch.bfloat16,704,256,4096,256,8,0,0,asm,11007+21000,841.3687
gfx938,f8_w8a8_channel,torch.bfloat16,768,256,4096,256,8,0,0,asm,11007+21000,851.6931
gfx938,f8_w8a8_channel,torch.bfloat16,832,256,4096,256,8,0,0,asm,11007+21000,854.2361
gfx938,f8_w8a8_channel,torch.bfloat16,896,256,4096,256,8,0,0,asm,11007+21000,895.1287
gfx938,f8_w8a8_channel,torch.bfloat16,960,256,4096,256,8,0,0,asm,12005+22001,913.5792
gfx938,f8_w8a8_channel,torch.bfloat16,1024,256,4096,256,8,0,0,asm,12005+22001,928.2065
gfx938,f8_w8a8_channel,torch.bfloat16,1152,256,4096,256,8,0,0,asm,12005+22001,1030.5556
gfx938,f8_w8a8_channel,torch.bfloat16,1280,256,4096,256,8,0,0,asm,12005+22001,974.0253
gfx938,f8_w8a8_channel,torch.bfloat16,1408,256,4096,256,8,0,0,asm,12005+22001,992.4589
gfx938,f8_w8a8_channel,torch.bfloat16,1536,256,4096,256,8,0,0,asm,12005+22001,1049.3093
gfx938,f8_w8a8_channel,torch.bfloat16,1664,256,4096,256,8,0,0,asm,12005+22001,1075.9619
gfx938,f8_w8a8_channel,torch.bfloat16,1792,256,4096,256,8,0,0,asm,12005+22001,1085.7218
gfx938,f8_w8a8_channel,torch.bfloat16,1920,256,4096,256,8,0,0,asm,12005+22001,1161.7554
gfx938,f8_w8a8_channel,torch.bfloat16,2048,256,4096,256,8,0,0,asm,12001+22001,1223.5404
gfx938,f8_w8a8_channel,torch.bfloat16,2304,256,4096,256,8,0,0,asm,13001+23001,1368.9801
gfx938,f8_w8a8_channel,torch.bfloat16,2560,256,4096,256,8,0,0,asm,13001+23001,1383.0179
gfx938,f8_w8a8_channel,torch.bfloat16,2816,256,4096,256,8,0,0,asm,13001+23001,1446.8324
gfx938,f8_w8a8_channel,torch.bfloat16,3072,256,4096,256,8,0,0,asm,13001+23001,1506.8069
gfx938,f8_w8a8_channel,torch.bfloat16,3328,256,4096,256,8,0,0,asm,13001+23001,1539.3036
gfx938,f8_w8a8_channel,torch.bfloat16,3584,256,4096,256,8,0,0,asm,13001+23001,1599.775
gfx938,f8_w8a8_channel,torch.bfloat16,3840,256,4096,256,8,0,0,asm,13001+23001,1742.5619
gfx938,f8_w8a8_channel,torch.bfloat16,4096,256,4096,256,8,0,0,asm,13001+23001,1961.475
gfx938,f8_w8a8_channel,torch.bfloat16,4608,256,4096,256,8,0,0,asm,12001+22001,2326.1309
gfx938,f8_w8a8_channel,torch.bfloat16,5120,256,4096,256,8,0,0,asm,12001+22001,2440.0339
gfx938,f8_w8a8_channel,torch.bfloat16,5632,256,4096,256,8,0,0,asm,12001+22001,2579.3011
gfx938,f8_w8a8_channel,torch.bfloat16,6144,256,4096,256,8,0,0,asm,13001+23001,2715.183
gfx938,f8_w8a8_channel,torch.bfloat16,6656,256,4096,256,8,0,0,asm,13001+23001,2793.6165
gfx938,f8_w8a8_channel,torch.bfloat16,7168,256,4096,256,8,0,0,asm,13001+23001,2894.8036
gfx938,f8_w8a8_channel,torch.bfloat16,7680,256,4096,256,8,0,0,asm,13001+23001,3076.6136
gfx938,f8_w8a8_channel,torch.bfloat16,8192,256,4096,256,8,0,0,asm,13001+23001,3431.2145
gfx938,f8_w8a8_channel,torch.bfloat16,10240,256,4096,256,8,0,0,asm,13001+23001,4102.8755
gfx938,f8_w8a8_channel,torch.bfloat16,12288,256,4096,256,8,0,0,asm,13001+23001,4808.2627
gfx938,f8_w8a8_channel,torch.bfloat16,14336,256,4096,256,8,0,0,asm,13001+23001,5609.1194
gfx938,f8_w8a8_channel,torch.bfloat16,16384,256,4096,256,8,0,0,asm,13001+23001,6265.4711
gfx938,f8_w8a8_channel,torch.bfloat16,17408,256,4096,256,8,0,0,asm,13001+23001,6744.4082
gfx938,f8_w8a8_channel,torch.bfloat16,24576,256,4096,256,8,0,0,asm,13001+23001,9062.0679
gfx938,f8_w8a8_channel,torch.bfloat16,32768,256,4096,256,8,0,0,asm,13001+23001,11895.1374
gfx938,f8_w8a8_channel,torch.bfloat16,40960,256,4096,256,8,0,0,asm,13001+23001,14666.4642
gfx938,f8_w8a8_channel,torch.bfloat16,49152,256,4096,256,8,0,0,asm,13001+23001,17470.387
gfx938,f8_w8a8_channel,torch.bfloat16,57344,256,4096,256,8,0,0,asm,13001+23001,20309.6104
gfx938,f8_w8a8_channel,torch.bfloat16,65536,256,4096,256,8,0,0,asm,13001+23001,23151.9759
gfx938,int8_w8a8_channel,torch.float16,1,512,2048,256,8,0,0,asm,10010+20000,57.5423
gfx938,int8_w8a8_channel,torch.float16,2,512,2048,256,8,0,0,asm,10013+20000,75.8832
gfx938,int8_w8a8_channel,torch.float16,3,512,2048,256,8,0,0,asm,10013+20000,103.3274
gfx938,int8_w8a8_channel,torch.float16,4,512,2048,256,8,0,0,asm,10013+20000,118.0053
gfx938,int8_w8a8_channel,torch.float16,5,512,2048,256,8,0,0,asm,10013+20000,140.1358
gfx938,int8_w8a8_channel,torch.float16,6,512,2048,256,8,0,0,asm,10013+20000,153.1881
gfx938,int8_w8a8_channel,torch.float16,7,512,2048,256,8,0,0,asm,10013+20000,181.3652
gfx938,int8_w8a8_channel,torch.float16,8,512,2048,256,8,0,0,asm,10013+20000,192.2788
gfx938,int8_w8a8_channel,torch.float16,9,512,2048,256,8,0,0,asm,10013+20000,208.2451
gfx938,int8_w8a8_channel,torch.float16,10,512,2048,256,8,0,0,asm,10008+20000,228.9861
gfx938,int8_w8a8_channel,torch.float16,11,512,2048,256,8,0,0,asm,10013+20000,240.826
gfx938,int8_w8a8_channel,torch.float16,12,512,2048,256,8,0,0,asm,10013+20000,251.5291
gfx938,int8_w8a8_channel,torch.float16,13,512,2048,256,8,0,0,asm,10011+20000,273.2555
gfx938,int8_w8a8_channel,torch.float16,14,512,2048,256,8,0,0,asm,10013+20000,285.9122
gfx938,int8_w8a8_channel,torch.float16,15,512,2048,256,8,0,0,asm,10013+20000,297.1289
gfx938,int8_w8a8_channel,torch.float16,16,512,2048,256,8,0,0,asm,10008+20000,309.1964
gfx938,int8_w8a8_channel,torch.float16,17,512,2048,256,8,0,0,asm,10009+20000,312.5196
gfx938,int8_w8a8_channel,torch.float16,18,512,2048,256,8,0,0,asm,10013+20000,320.166
gfx938,int8_w8a8_channel,torch.float16,20,512,2048,256,8,0,0,asm,10008+20000,349.8501
gfx938,int8_w8a8_channel,torch.float16,24,512,2048,256,8,0,0,asm,10009+20000,390.6669
gfx938,int8_w8a8_channel,torch.float16,28,512,2048,256,8,0,0,asm,10013+20000,454.9896
gfx938,int8_w8a8_channel,torch.float16,32,512,2048,256,8,0,0,asm,11005+20000,482.9222
gfx938,int8_w8a8_channel,torch.float16,34,512,2048,256,8,0,0,asm,11005+20000,468.7803
gfx938,int8_w8a8_channel,torch.float16,36,512,2048,256,8,0,0,asm,10011+20000,496.3086
gfx938,int8_w8a8_channel,torch.float16,40,512,2048,256,8,0,0,asm,10011+20000,516.0896
gfx938,int8_w8a8_channel,torch.float16,44,512,2048,256,8,0,0,asm,10011+20000,536.3675
gfx938,int8_w8a8_channel,torch.float16,48,512,2048,256,8,0,0,asm,10008+20000,550.9443
gfx938,int8_w8a8_channel,torch.float16,56,512,2048,256,8,0,0,asm,10011+20000,577.7821
gfx938,int8_w8a8_channel,torch.float16,64,512,2048,256,8,0,0,asm,10008+20000,597.9084
gfx938,int8_w8a8_channel,torch.float16,68,512,2048,256,8,0,0,asm,10008+20000,601.681
gfx938,int8_w8a8_channel,torch.float16,72,512,2048,256,8,0,0,asm,10008+20000,615.7609
gfx938,int8_w8a8_channel,torch.float16,80,512,2048,256,8,0,0,asm,10011+20000,636.763
gfx938,int8_w8a8_channel,torch.float16,88,512,2048,256,8,0,0,asm,10011+20000,650.9019
gfx938,int8_w8a8_channel,torch.float16,96,512,2048,256,8,0,0,asm,10011+20000,655.6177
gfx938,int8_w8a8_channel,torch.float16,104,512,2048,256,8,0,0,asm,10011+20000,665.5714
gfx938,int8_w8a8_channel,torch.float16,112,512,2048,256,8,0,0,asm,10011+20000,670.8683
gfx938,int8_w8a8_channel,torch.float16,128,512,2048,256,8,0,0,asm,10011+20000,685.1923
gfx938,int8_w8a8_channel,torch.float16,144,512,2048,256,8,0,0,asm,10008+20001,725.1081
gfx938,int8_w8a8_channel,torch.float16,160,512,2048,256,8,0,0,asm,10011+20000,696.4429
gfx938,int8_w8a8_channel,torch.float16,192,512,2048,256,8,0,0,asm,10011+20000,700.8724
gfx938,int8_w8a8_channel,torch.float16,224,512,2048,256,8,0,0,asm,10011+20000,711.4071
gfx938,int8_w8a8_channel,torch.float16,256,512,2048,256,8,0,0,asm,10011+20000,716.0387
gfx938,int8_w8a8_channel,torch.float16,320,512,2048,256,8,0,0,asm,11007+21000,774.5733
gfx938,int8_w8a8_channel,torch.float16,384,512,2048,256,8,0,0,asm,10011+20000,747.0113
gfx938,int8_w8a8_channel,torch.float16,448,512,2048,256,8,0,0,asm,11005+21000,773.7396
gfx938,int8_w8a8_channel,torch.float16,512,512,2048,256,8,0,0,asm,11005+21000,787.6848
gfx938,int8_w8a8_channel,torch.float16,576,512,2048,256,8,0,0,asm,12005+22000,861.8404
gfx938,int8_w8a8_channel,torch.float16,640,512,2048,256,8,0,0,asm,11005+21000,809.0491
gfx938,int8_w8a8_channel,torch.float16,704,512,2048,256,8,0,0,asm,11007+21000,826.4215
gfx938,int8_w8a8_channel,torch.float16,768,512,2048,256,8,0,0,asm,11005+21000,838.3794
gfx938,int8_w8a8_channel,torch.float16,832,512,2048,256,8,0,0,asm,11005+21000,849.7562
gfx938,int8_w8a8_channel,torch.float16,896,512,2048,256,8,0,0,asm,12005+22000,884.7455
gfx938,int8_w8a8_channel,torch.float16,960,512,2048,256,8,0,0,asm,12001+22000,867.4824
gfx938,int8_w8a8_channel,torch.float16,1024,512,2048,256,8,0,0,asm,12005+22000,888.8297
gfx938,int8_w8a8_channel,torch.float16,1152,512,2048,256,8,0,0,asm,12005+22000,941.4443
gfx938,int8_w8a8_channel,torch.float16,1280,512,2048,256,8,0,0,asm,12001+22000,928.9812
gfx938,int8_w8a8_channel,torch.float16,1408,512,2048,256,8,0,0,asm,12001+22000,947.9622
gfx938,int8_w8a8_channel,torch.float16,1536,512,2048,256,8,0,0,asm,12005+22000,980.9053
gfx938,int8_w8a8_channel,torch.float16,1664,512,2048,256,8,0,0,asm,12001+22000,999.9199
gfx938,int8_w8a8_channel,torch.float16,1792,512,2048,256,8,0,0,asm,12001+22000,1022.5051
gfx938,int8_w8a8_channel,torch.float16,1920,512,2048,256,8,0,0,asm,12001+22000,1070.6567
gfx938,int8_w8a8_channel,torch.float16,2048,512,2048,256,8,0,0,asm,12001+22001,1153.3089
gfx938,int8_w8a8_channel,torch.float16,2304,512,2048,256,8,0,0,asm,13001+23001,1182.4963
gfx938,int8_w8a8_channel,torch.float16,2560,512,2048,256,8,0,0,asm,13001+23001,1224.7614
gfx938,int8_w8a8_channel,torch.float16,2816,512,2048,256,8,0,0,asm,13001+23001,1245.3172
gfx938,int8_w8a8_channel,torch.float16,3072,512,2048,256,8,0,0,asm,13001+23001,1278.1086
gfx938,int8_w8a8_channel,torch.float16,3328,512,2048,256,8,0,0,asm,13001+23001,1336.2475
gfx938,int8_w8a8_channel,torch.float16,3584,512,2048,256,8,0,0,asm,13001+23001,1381.6115
gfx938,int8_w8a8_channel,torch.float16,3840,512,2048,256,8,0,0,asm,13001+23001,1493.1565
gfx938,int8_w8a8_channel,torch.float16,4096,512,2048,256,8,0,0,asm,13001+23001,1657.3499
gfx938,int8_w8a8_channel,torch.float16,4608,512,2048,256,8,0,0,asm,13001+23001,1993.45
gfx938,int8_w8a8_channel,torch.float16,5120,512,2048,256,8,0,0,asm,12001+22001,2101.3907
gfx938,int8_w8a8_channel,torch.float16,5632,512,2048,256,8,0,0,asm,13001+23001,2184.7083
gfx938,int8_w8a8_channel,torch.float16,6144,512,2048,256,8,0,0,asm,13001+23001,2237.3144
gfx938,int8_w8a8_channel,torch.float16,6656,512,2048,256,8,0,0,asm,13001+23001,2297.6341
gfx938,int8_w8a8_channel,torch.float16,7168,512,2048,256,8,0,0,asm,13001+23001,2372.7578
gfx938,int8_w8a8_channel,torch.float16,7680,512,2048,256,8,0,0,asm,13001+23001,2503.2248
gfx938,int8_w8a8_channel,torch.float16,8192,512,2048,256,8,0,0,asm,13001+23001,2796.6807
gfx938,int8_w8a8_channel,torch.float16,10240,512,2048,256,8,0,0,asm,13001+23001,3349.3024
gfx938,int8_w8a8_channel,torch.float16,12288,512,2048,256,8,0,0,asm,13001+23001,3896.5094
gfx938,int8_w8a8_channel,torch.float16,14336,512,2048,256,8,0,0,asm,13001+23001,4486.15
gfx938,int8_w8a8_channel,torch.float16,16384,512,2048,256,8,0,0,asm,13001+23001,5067.0073
gfx938,int8_w8a8_channel,torch.float16,17408,512,2048,256,8,0,0,asm,13001+23001,5469.187
gfx938,int8_w8a8_channel,torch.float16,24576,512,2048,256,8,0,0,asm,13001+23001,7372.5919
gfx938,int8_w8a8_channel,torch.float16,32768,512,2048,256,8,0,0,asm,13001+23001,11965.4224
gfx938,int8_w8a8_channel,torch.float16,40960,512,2048,256,8,0,0,asm,13001+23001,14939.3742
gfx938,int8_w8a8_channel,torch.float16,49152,512,2048,256,8,0,0,asm,13001+23001,17880.1629
gfx938,int8_w8a8_channel,torch.float16,57344,512,2048,256,8,0,0,asm,13001+23001,20746.1576
gfx938,int8_w8a8_channel,torch.float16,65536,512,2048,256,8,0,0,asm,13001+22001,24014.33
gfx938,int8_w8a8_channel,torch.float16,65536,512,2048,256,8,0,0,asm,13001+23001,23307.1407
gfx938,int8_w8a8_channel,torch.bfloat16,1,512,2048,256,8,0,0,asm,10010+20000,66.1234
gfx938,int8_w8a8_channel,torch.bfloat16,2,512,2048,256,8,0,0,asm,10013+20000,87.4538
gfx938,int8_w8a8_channel,torch.bfloat16,3,512,2048,256,8,0,0,asm,10010+20000,114.2326
gfx938,int8_w8a8_channel,torch.bfloat16,4,512,2048,256,8,0,0,asm,10013+20000,130.999
gfx938,int8_w8a8_channel,torch.bfloat16,5,512,2048,256,8,0,0,asm,10013+20000,155.5547
gfx938,int8_w8a8_channel,torch.bfloat16,6,512,2048,256,8,0,0,asm,10013+20000,167.4367
gfx938,int8_w8a8_channel,torch.bfloat16,7,512,2048,256,8,0,0,asm,10013+20000,202.2999
gfx938,int8_w8a8_channel,torch.bfloat16,8,512,2048,256,8,0,0,asm,10013+20000,208.5902
gfx938,int8_w8a8_channel,torch.bfloat16,9,512,2048,256,8,0,0,asm,10013+20000,226.6451
gfx938,int8_w8a8_channel,torch.bfloat16,10,512,2048,256,8,0,0,asm,10013+20000,253.2217
gfx938,int8_w8a8_channel,torch.bfloat16,11,512,2048,256,8,0,0,asm,10013+20000,260.186
gfx938,int8_w8a8_channel,torch.bfloat16,12,512,2048,256,8,0,0,asm,10013+20000,270.1901
gfx938,int8_w8a8_channel,torch.bfloat16,13,512,2048,256,8,0,0,asm,10013+20000,296.2028
gfx938,int8_w8a8_channel,torch.bfloat16,14,512,2048,256,8,0,0,asm,10013+20000,309.7185
gfx938,int8_w8a8_channel,torch.bfloat16,15,512,2048,256,8,0,0,asm,10013+20000,318.2238
gfx938,int8_w8a8_channel,torch.bfloat16,16,512,2048,256,8,0,0,asm,10013+20000,340.7668
gfx938,int8_w8a8_channel,torch.bfloat16,17,512,2048,256,8,0,0,asm,10013+20000,335.9133
gfx938,int8_w8a8_channel,torch.bfloat16,18,512,2048,256,8,0,0,asm,10013+20000,342.3722
gfx938,int8_w8a8_channel,torch.bfloat16,20,512,2048,256,8,0,0,asm,10011+20000,377.9258
gfx938,int8_w8a8_channel,torch.bfloat16,24,512,2048,256,8,0,0,asm,10011+20000,422.1447
gfx938,int8_w8a8_channel,torch.bfloat16,28,512,2048,256,8,0,0,asm,10013+20000,491.0148
gfx938,int8_w8a8_channel,torch.bfloat16,32,512,2048,256,8,0,0,asm,10011+20000,519.6043
gfx938,int8_w8a8_channel,torch.bfloat16,34,512,2048,256,8,0,0,asm,10011+20000,508.8813
gfx938,int8_w8a8_channel,torch.bfloat16,36,512,2048,256,8,0,0,asm,10011+20000,523.6266
gfx938,int8_w8a8_channel,torch.bfloat16,40,512,2048,256,8,0,0,asm,10011+20000,552.1233
gfx938,int8_w8a8_channel,torch.bfloat16,44,512,2048,256,8,0,0,asm,10011+20000,567.8369
gfx938,int8_w8a8_channel,torch.bfloat16,48,512,2048,256,8,0,0,asm,10011+20000,582.3801
gfx938,int8_w8a8_channel,torch.bfloat16,56,512,2048,256,8,0,0,asm,10011+20000,605.1421
gfx938,int8_w8a8_channel,torch.bfloat16,64,512,2048,256,8,0,0,asm,10011+20000,631.66
gfx938,int8_w8a8_channel,torch.bfloat16,68,512,2048,256,8,0,0,asm,10011+20000,638.2958
gfx938,int8_w8a8_channel,torch.bfloat16,72,512,2048,256,8,0,0,asm,10011+20000,648.8725
gfx938,int8_w8a8_channel,torch.bfloat16,80,512,2048,256,8,0,0,asm,10011+20000,670.4557
gfx938,int8_w8a8_channel,torch.bfloat16,88,512,2048,256,8,0,0,asm,10011+20000,685.4114
gfx938,int8_w8a8_channel,torch.bfloat16,96,512,2048,256,8,0,0,asm,10011+20000,690.6072
gfx938,int8_w8a8_channel,torch.bfloat16,104,512,2048,256,8,0,0,asm,10011+20000,682.2114
gfx938,int8_w8a8_channel,torch.bfloat16,112,512,2048,256,8,0,0,asm,10011+20000,707.9545
gfx938,int8_w8a8_channel,torch.bfloat16,128,512,2048,256,8,0,0,asm,10011+20000,720.4093
gfx938,int8_w8a8_channel,torch.bfloat16,144,512,2048,256,8,0,0,asm,10011+20000,711.2387
gfx938,int8_w8a8_channel,torch.bfloat16,160,512,2048,256,8,0,0,asm,10011+20000,730.3039
gfx938,int8_w8a8_channel,torch.bfloat16,192,512,2048,256,8,0,0,asm,10011+20000,739.9207
gfx938,int8_w8a8_channel,torch.bfloat16,224,512,2048,256,8,0,0,asm,10012+20000,746.9103
gfx938,int8_w8a8_channel,torch.bfloat16,256,512,2048,256,8,0,0,asm,10011+20000,755.8366
gfx938,int8_w8a8_channel,torch.bfloat16,320,512,2048,256,8,0,0,asm,10011+20000,780.4092
gfx938,int8_w8a8_channel,torch.bfloat16,384,512,2048,256,8,0,0,asm,10011+20000,794.2703
gfx938,int8_w8a8_channel,torch.bfloat16,448,512,2048,256,8,0,0,asm,11005+21000,800.1734
gfx938,int8_w8a8_channel,torch.bfloat16,512,512,2048,256,8,0,0,asm,11005+21000,841.2765
gfx938,int8_w8a8_channel,torch.bfloat16,576,512,2048,256,8,0,0,asm,11007+21000,863.8701
gfx938,int8_w8a8_channel,torch.bfloat16,640,512,2048,256,8,0,0,asm,11005+21000,836.1818
gfx938,int8_w8a8_channel,torch.bfloat16,704,512,2048,256,8,0,0,asm,11005+21000,858.7079
gfx938,int8_w8a8_channel,torch.bfloat16,768,512,2048,256,8,0,0,asm,11005+21000,897.6973
gfx938,int8_w8a8_channel,torch.bfloat16,832,512,2048,256,8,0,0,asm,11005+21000,929.8656
gfx938,int8_w8a8_channel,torch.bfloat16,896,512,2048,256,8,0,0,asm,12001+22000,944.9813
gfx938,int8_w8a8_channel,torch.bfloat16,960,512,2048,256,8,0,0,asm,12001+22000,955.5665
gfx938,int8_w8a8_channel,torch.bfloat16,1024,512,2048,256,8,0,0,asm,12001+22000,965.9664
gfx938,int8_w8a8_channel,torch.bfloat16,1152,512,2048,256,8,0,0,asm,12005+22000,1028.9894
gfx938,int8_w8a8_channel,torch.bfloat16,1280,512,2048,256,8,0,0,asm,12001+22000,999.2968
gfx938,int8_w8a8_channel,torch.bfloat16,1408,512,2048,256,8,0,0,asm,12001+22001,1032.0463
gfx938,int8_w8a8_channel,torch.bfloat16,1536,512,2048,256,8,0,0,asm,12001+22000,1052.8631
gfx938,int8_w8a8_channel,torch.bfloat16,1664,512,2048,256,8,0,0,asm,12001+22001,1086.3956
gfx938,int8_w8a8_channel,torch.bfloat16,1792,512,2048,256,8,0,0,asm,12001+22001,1132.8207
gfx938,int8_w8a8_channel,torch.bfloat16,1920,512,2048,256,8,0,0,asm,12001+22001,1190.7238
gfx938,int8_w8a8_channel,torch.bfloat16,2048,512,2048,256,8,0,0,asm,12005+22001,1308.5508
gfx938,int8_w8a8_channel,torch.bfloat16,2304,512,2048,256,8,0,0,asm,13001+23001,1372.2055
gfx938,int8_w8a8_channel,torch.bfloat16,2560,512,2048,256,8,0,0,asm,13001+23001,1401.258
gfx938,int8_w8a8_channel,torch.bfloat16,2816,512,2048,256,8,0,0,asm,13001+23001,1442.5548
gfx938,int8_w8a8_channel,torch.bfloat16,3072,512,2048,256,8,0,0,asm,13001+23001,1476.5842
gfx938,int8_w8a8_channel,torch.bfloat16,3328,512,2048,256,8,0,0,asm,13001+23001,1541.9145
gfx938,int8_w8a8_channel,torch.bfloat16,3584,512,2048,256,8,0,0,asm,13001+23001,1587.8681
gfx938,int8_w8a8_channel,torch.bfloat16,3840,512,2048,256,8,0,0,asm,13001+23001,1753.4172
gfx938,int8_w8a8_channel,torch.bfloat16,4096,512,2048,256,8,0,0,asm,13001+23001,2012.5745
gfx938,int8_w8a8_channel,torch.bfloat16,4608,512,2048,256,8,0,0,asm,12001+22001,2424.8344
gfx938,int8_w8a8_channel,torch.bfloat16,5120,512,2048,256,8,0,0,asm,12001+22001,2563.3436
gfx938,int8_w8a8_channel,torch.bfloat16,5632,512,2048,256,8,0,0,asm,12001+22001,2694.5769
gfx938,int8_w8a8_channel,torch.bfloat16,6144,512,2048,256,8,0,0,asm,13001+23001,2759.1072
gfx938,int8_w8a8_channel,torch.bfloat16,6656,512,2048,256,8,0,0,asm,13001+23001,2816.3026
gfx938,int8_w8a8_channel,torch.bfloat16,7168,512,2048,256,8,0,0,asm,13001+23001,2898.4665
gfx938,int8_w8a8_channel,torch.bfloat16,7680,512,2048,256,8,0,0,asm,13001+23001,3065.6827
gfx938,int8_w8a8_channel,torch.bfloat16,8192,512,2048,256,8,0,0,asm,13001+23001,3465.1172
gfx938,int8_w8a8_channel,torch.bfloat16,10240,512,2048,256,8,0,0,asm,13001+23001,4148.9045
gfx938,int8_w8a8_channel,torch.bfloat16,12288,512,2048,256,8,0,0,asm,13001+23001,4845.3656
gfx938,int8_w8a8_channel,torch.bfloat16,14336,512,2048,256,8,0,0,asm,13001+23001,5592.5893
gfx938,int8_w8a8_channel,torch.bfloat16,16384,512,2048,256,8,0,0,asm,13001+23001,6308.3012
gfx938,int8_w8a8_channel,torch.bfloat16,17408,512,2048,256,8,0,0,asm,13001+23001,6826.1859
gfx938,int8_w8a8_channel,torch.bfloat16,24576,512,2048,256,8,0,0,asm,13001+23001,9185.3035
gfx938,int8_w8a8_channel,torch.bfloat16,32768,512,2048,256,8,0,0,asm,13001+23001,12019.847
gfx938,int8_w8a8_channel,torch.bfloat16,40960,512,2048,256,8,0,0,asm,13001+23001,14983.3494
gfx938,int8_w8a8_channel,torch.bfloat16,49152,512,2048,256,8,0,0,asm,13001+23001,17938.1767
gfx938,int8_w8a8_channel,torch.bfloat16,57344,512,2048,256,8,0,0,asm,13001+23001,20809.5385
gfx938,int8_w8a8_channel,torch.bfloat16,65536,512,2048,256,8,0,0,asm,13001+23001,23354.4759
gfx938,int8_w8a8_channel,torch.bfloat16,65536,512,2048,256,8,0,0,asm,13001+23001,23370.3628
gfx938,f8_w8a8_channel,torch.bfloat16,1,320,6144,160,8,0,0,asm,10008+20000,87.8748
gfx938,f8_w8a8_channel,torch.bfloat16,2,320,6144,160,8,0,0,asm,10011+20001,129.4152
gfx938,f8_w8a8_channel,torch.bfloat16,3,320,6144,160,8,0,0,asm,10008+20000,160.6829
gfx938,f8_w8a8_channel,torch.bfloat16,4,320,6144,160,8,0,0,asm,10002+20000,199.8576
gfx938,f8_w8a8_channel,torch.bfloat16,5,320,6144,160,8,0,0,asm,10009+20000,224.0766
gfx938,f8_w8a8_channel,torch.bfloat16,6,320,6144,160,8,0,0,asm,10002+20000,266.0216
gfx938,f8_w8a8_channel,torch.bfloat16,7,320,6144,160,8,0,0,asm,10008+20000,284.0428
gfx938,f8_w8a8_channel,torch.bfloat16,8,320,6144,160,8,0,0,asm,10011+20000,329.752
gfx938,f8_w8a8_channel,torch.bfloat16,9,320,6144,160,8,0,0,asm,10011+20000,349.2551
gfx938,f8_w8a8_channel,torch.bfloat16,10,320,6144,160,8,0,0,asm,10002+20000,407.293
gfx938,f8_w8a8_channel,torch.bfloat16,11,320,6144,160,8,0,0,asm,10008+20000,425.7182
gfx938,f8_w8a8_channel,torch.bfloat16,12,320,6144,160,8,0,0,asm,10011+20000,445.836
gfx938,f8_w8a8_channel,torch.bfloat16,13,320,6144,160,8,0,0,asm,10011+20000,458.4254
gfx938,f8_w8a8_channel,torch.bfloat16,14,320,6144,160,8,0,0,asm,10008+20000,493.6169
gfx938,f8_w8a8_channel,torch.bfloat16,15,320,6144,160,8,0,0,asm,10011+20000,521.6674
gfx938,f8_w8a8_channel,torch.bfloat16,16,320,6144,160,8,0,0,asm,10011+20000,529.3306
gfx938,f8_w8a8_channel,torch.bfloat16,17,320,6144,160,8,0,0,asm,10011+20000,532.4435
gfx938,f8_w8a8_channel,torch.bfloat16,18,320,6144,160,8,0,0,asm,10011+20000,545.1423
gfx938,f8_w8a8_channel,torch.bfloat16,20,320,6144,160,8,0,0,asm,10011+20000,600.0979
gfx938,f8_w8a8_channel,torch.bfloat16,24,320,6144,160,8,0,0,asm,10012+20000,637.3948
gfx938,f8_w8a8_channel,torch.bfloat16,28,320,6144,160,8,0,0,asm,10011+20000,661.3219
gfx938,f8_w8a8_channel,torch.bfloat16,32,320,6144,160,8,0,0,asm,10011+20000,712.3954
gfx938,f8_w8a8_channel,torch.bfloat16,34,320,6144,160,8,0,0,asm,10011+20000,714.4303
gfx938,f8_w8a8_channel,torch.bfloat16,36,320,6144,160,8,0,0,asm,10011+20000,726.3377
gfx938,f8_w8a8_channel,torch.bfloat16,40,320,6144,160,8,0,0,asm,10012+20000,733.1418
gfx938,f8_w8a8_channel,torch.bfloat16,44,320,6144,160,8,0,0,asm,10011+20000,739.8449
gfx938,f8_w8a8_channel,torch.bfloat16,48,320,6144,160,8,0,0,asm,11005+21000,783.7775
gfx938,f8_w8a8_channel,torch.bfloat16,56,320,6144,160,8,0,0,asm,10011+20000,804.1311
gfx938,f8_w8a8_channel,torch.bfloat16,64,320,6144,160,8,0,0,asm,10011+20000,810.3711
gfx938,f8_w8a8_channel,torch.bfloat16,68,320,6144,160,8,0,0,asm,10012+20000,810.5901
gfx938,f8_w8a8_channel,torch.bfloat16,72,320,6144,160,8,0,0,asm,10011+20000,813.4195
gfx938,f8_w8a8_channel,torch.bfloat16,80,320,6144,160,8,0,0,asm,10011+20000,824.6364
gfx938,f8_w8a8_channel,torch.bfloat16,88,320,6144,160,8,0,0,asm,10012+20000,828.4427
gfx938,f8_w8a8_channel,torch.bfloat16,96,320,6144,160,8,0,0,asm,10011+20000,831.988
gfx938,f8_w8a8_channel,torch.bfloat16,104,320,6144,160,8,0,0,asm,10011+20000,838.1268
gfx938,f8_w8a8_channel,torch.bfloat16,112,320,6144,160,8,0,0,asm,10011+20000,840.889
gfx938,f8_w8a8_channel,torch.bfloat16,128,320,6144,160,8,0,0,asm,10011+20000,850.2531
gfx938,f8_w8a8_channel,torch.bfloat16,144,320,6144,160,8,0,0,asm,10012+20000,859.3393
gfx938,f8_w8a8_channel,torch.bfloat16,160,320,6144,160,8,0,0,asm,10011+20000,864.9309
gfx938,f8_w8a8_channel,torch.bfloat16,192,320,6144,160,8,0,0,asm,11005+21000,882.9772
gfx938,f8_w8a8_channel,torch.bfloat16,224,320,6144,160,8,0,0,asm,10011+20000,898.3287
gfx938,f8_w8a8_channel,torch.bfloat16,256,320,6144,160,8,0,0,asm,11005+21000,912.3244
gfx938,f8_w8a8_channel,torch.bfloat16,320,320,6144,160,8,0,0,asm,11005+21000,943.9706
gfx938,f8_w8a8_channel,torch.bfloat16,384,320,6144,160,8,0,0,asm,11005+21000,1033.402
gfx938,f8_w8a8_channel,torch.bfloat16,448,320,6144,160,8,0,0,asm,11005+21000,997.2927
gfx938,f8_w8a8_channel,torch.bfloat16,512,320,6144,160,8,0,0,asm,11005+21000,1031.0105
gfx938,f8_w8a8_channel,torch.bfloat16,576,320,6144,160,8,0,0,asm,11005+21000,1134.2692
gfx938,f8_w8a8_channel,torch.bfloat16,640,320,6144,160,8,0,0,asm,12000+22001,1198.9596
gfx938,f8_w8a8_channel,torch.bfloat16,704,320,6144,160,8,0,0,asm,12001+22001,1171.1534
gfx938,f8_w8a8_channel,torch.bfloat16,768,320,6144,160,8,0,0,asm,12001+22001,1242.6059
gfx938,f8_w8a8_channel,torch.bfloat16,832,320,6144,160,8,0,0,asm,12001+22001,1202.7659
gfx938,f8_w8a8_channel,torch.bfloat16,896,320,6144,160,8,0,0,asm,12001+22001,1210.446
gfx938,f8_w8a8_channel,torch.bfloat16,960,320,6144,160,8,0,0,asm,12001+22001,1248.0121
gfx938,f8_w8a8_channel,torch.bfloat16,1024,320,6144,160,8,0,0,asm,12001+22001,1277.2499
gfx938,f8_w8a8_channel,torch.bfloat16,1152,320,6144,160,8,0,0,asm,12001+22001,1399.4727
gfx938,f8_w8a8_channel,torch.bfloat16,1280,320,6144,160,8,0,0,asm,12001+22001,1749.5096
gfx938,f8_w8a8_channel,torch.bfloat16,1408,320,6144,160,8,0,0,asm,13001+23001,1848.2125
gfx938,f8_w8a8_channel,torch.bfloat16,1536,320,6144,160,8,0,0,asm,13001+23001,1873.7618
gfx938,f8_w8a8_channel,torch.bfloat16,1664,320,6144,160,8,0,0,asm,13001+23001,1907.8164
gfx938,f8_w8a8_channel,torch.bfloat16,1792,320,6144,160,8,0,0,asm,13001+23001,1935.1678
gfx938,f8_w8a8_channel,torch.bfloat16,1920,320,6144,160,8,0,0,asm,13001+23001,1966.5951
gfx938,f8_w8a8_channel,torch.bfloat16,2048,320,6144,160,8,0,0,asm,13001+23001,1995.8666
gfx938,f8_w8a8_channel,torch.bfloat16,2304,320,6144,160,8,0,0,asm,13001+23001,2217.6428
gfx938,f8_w8a8_channel,torch.bfloat16,2560,320,6144,160,8,0,0,asm,13001+23001,2676.4204
gfx938,f8_w8a8_channel,torch.bfloat16,2816,320,6144,160,8,0,0,asm,12001+22001,3274.8019
gfx938,f8_w8a8_channel,torch.bfloat16,3072,320,6144,160,8,0,0,asm,12001+22001,3447.8037
gfx938,f8_w8a8_channel,torch.bfloat16,3328,320,6144,160,8,0,0,asm,12001+22001,3513.1594
gfx938,f8_w8a8_channel,torch.bfloat16,3584,320,6144,160,8,0,0,asm,13001+23001,3681.1507
gfx938,f8_w8a8_channel,torch.bfloat16,3840,320,6144,160,8,0,0,asm,13001+23001,3740.8308
gfx938,f8_w8a8_channel,torch.bfloat16,4096,320,6144,160,8,0,0,asm,13001+23001,3787.5591
gfx938,f8_w8a8_channel,torch.bfloat16,4608,320,6144,160,8,0,0,asm,13001+23001,3988.2912
gfx938,f8_w8a8_channel,torch.bfloat16,5120,320,6144,160,8,0,0,asm,13001+23001,4664.4407
gfx938,f8_w8a8_channel,torch.bfloat16,5632,320,6144,160,8,0,0,asm,13001+23001,5438.1986
gfx938,f8_w8a8_channel,torch.bfloat16,6144,320,6144,160,8,0,0,asm,13001+23001,5639.4108
gfx938,f8_w8a8_channel,torch.bfloat16,6656,320,6144,160,8,0,0,asm,13001+23001,5753.8779
gfx938,f8_w8a8_channel,torch.bfloat16,7168,320,6144,160,8,0,0,asm,13001+23000,6060.2096
gfx938,f8_w8a8_channel,torch.bfloat16,7680,320,6144,160,8,0,0,asm,13001+23001,6691.5508
gfx938,f8_w8a8_channel,torch.bfloat16,8192,320,6144,160,8,0,0,asm,13001+23001,7332.6939
gfx938,f8_w8a8_channel,torch.bfloat16,10240,320,6144,160,8,0,0,asm,13001+23001,8524.7909
gfx938,f8_w8a8_channel,torch.bfloat16,12288,320,6144,160,8,0,0,asm,13001+23001,9910.8836
gfx938,f8_w8a8_channel,torch.bfloat16,14336,320,6144,160,8,0,0,asm,13001+23001,11609.6582
gfx938,f8_w8a8_channel,torch.bfloat16,16384,320,6144,160,8,0,0,asm,13001+23001,13364.4164
gfx938,f8_w8a8_channel,torch.bfloat16,17408,320,6144,160,8,0,0,asm,13001+23001,13945.3495
gfx938,f8_w8a8_channel,torch.bfloat16,24576,320,6144,160,8,0,0,asm,13001+23001,19386.0554
gfx938,f8_w8a8_channel,torch.bfloat16,32768,320,6144,160,8,0,0,asm,13001+23001,25609.6308
gfx938,f8_w8a8_channel,torch.bfloat16,40960,320,6144,160,8,0,0,asm,13001+23001,31906.0787
gfx938,f8_w8a8_channel,torch.bfloat16,49152,320,6144,160,8,0,0,asm,13001+23001,38174.2552
gfx938,f8_w8a8_channel,torch.bfloat16,57344,320,6144,160,8,0,0,asm,13001+23001,44395.4213
gfx938,f8_w8a8_channel,torch.bfloat16,65536,320,6144,160,8,0,0,asm,13001+23001,50864.8195
gfx938,f8_w8a8_channel,torch.bfloat16,65536,320,6144,160,8,0,0,asm,13001+23001,50655.7567
gfx938,f8_w8a8_channel,torch.float16,1,320,6144,160,8,0,0,asm,10008+20000,86.8391
gfx938,f8_w8a8_channel,torch.float16,2,320,6144,160,8,0,0,asm,10012+20000,129.7864
gfx938,f8_w8a8_channel,torch.float16,3,320,6144,160,8,0,0,asm,10008+20000,160.3293
gfx938,f8_w8a8_channel,torch.float16,4,320,6144,160,8,0,0,asm,10002+20000,199.0578
gfx938,f8_w8a8_channel,torch.float16,5,320,6144,160,8,0,0,asm,10009+20000,224.0428
gfx938,f8_w8a8_channel,torch.float16,6,320,6144,160,8,0,0,asm,10002+20000,267.9417
gfx938,f8_w8a8_channel,torch.float16,7,320,6144,160,8,0,0,asm,10008+20000,285.1374
gfx938,f8_w8a8_channel,torch.float16,8,320,6144,160,8,0,0,asm,10002+20000,337.491
gfx938,f8_w8a8_channel,torch.float16,9,320,6144,160,8,0,0,asm,10011+20000,349.1963
gfx938,f8_w8a8_channel,torch.float16,10,320,6144,160,8,0,0,asm,10002+20000,405.5077
gfx938,f8_w8a8_channel,torch.float16,11,320,6144,160,8,0,0,asm,10008+20000,425.0951
gfx938,f8_w8a8_channel,torch.float16,12,320,6144,160,8,0,0,asm,10012+20000,453.8696
gfx938,f8_w8a8_channel,torch.float16,13,320,6144,160,8,0,0,asm,10011+20000,459.6549
gfx938,f8_w8a8_channel,torch.float16,14,320,6144,160,8,0,0,asm,10008+20000,493.0359
gfx938,f8_w8a8_channel,torch.float16,15,320,6144,160,8,0,0,asm,10011+20000,520.7074
gfx938,f8_w8a8_channel,torch.float16,16,320,6144,160,8,0,0,asm,10011+20000,530.198
gfx938,f8_w8a8_channel,torch.float16,17,320,6144,160,8,0,0,asm,10011+20000,532.9151
gfx938,f8_w8a8_channel,torch.float16,18,320,6144,160,8,0,0,asm,10011+20000,542.9361
gfx938,f8_w8a8_channel,torch.float16,20,320,6144,160,8,0,0,asm,10008+20000,601.6643
gfx938,f8_w8a8_channel,torch.float16,24,320,6144,160,8,0,0,asm,10011+20000,632.7463
gfx938,f8_w8a8_channel,torch.float16,28,320,6144,160,8,0,0,asm,10011+20000,661.8019
gfx938,f8_w8a8_channel,torch.float16,32,320,6144,160,8,0,0,asm,10012+20000,715.9659
gfx938,f8_w8a8_channel,torch.float16,34,320,6144,160,8,0,0,asm,10011+20000,716.8639
gfx938,f8_w8a8_channel,torch.float16,36,320,6144,160,8,0,0,asm,10011+20000,725.8576
gfx938,f8_w8a8_channel,torch.float16,40,320,6144,160,8,0,0,asm,10011+20000,731.6175
gfx938,f8_w8a8_channel,torch.float16,44,320,6144,160,8,0,0,asm,10011+20000,736.8048
gfx938,f8_w8a8_channel,torch.float16,48,320,6144,160,8,0,0,asm,10012+20000,825.9498
gfx938,f8_w8a8_channel,torch.float16,56,320,6144,160,8,0,0,asm,10012+20000,804.4173
gfx938,f8_w8a8_channel,torch.float16,64,320,6144,160,8,0,0,asm,10011+20000,808.5773
gfx938,f8_w8a8_channel,torch.float16,68,320,6144,160,8,0,0,asm,10011+20000,807.6089
gfx938,f8_w8a8_channel,torch.float16,72,320,6144,160,8,0,0,asm,10011+20000,815.5415
gfx938,f8_w8a8_channel,torch.float16,80,320,6144,160,8,0,0,asm,10011+20000,825.4531
gfx938,f8_w8a8_channel,torch.float16,88,320,6144,160,8,0,0,asm,10011+20000,825.4868
gfx938,f8_w8a8_channel,torch.float16,96,320,6144,160,8,0,0,asm,10011+20000,830.8931
gfx938,f8_w8a8_channel,torch.float16,104,320,6144,160,8,0,0,asm,10012+20000,838.5309
gfx938,f8_w8a8_channel,torch.float16,112,320,6144,160,8,0,0,asm,10011+20000,841.9751
gfx938,f8_w8a8_channel,torch.float16,128,320,6144,160,8,0,0,asm,10012+20000,850.0088
gfx938,f8_w8a8_channel,torch.float16,144,320,6144,160,8,0,0,asm,10012+20000,857.8152
gfx938,f8_w8a8_channel,torch.float16,160,320,6144,160,8,0,0,asm,10011+20000,866.2614
gfx938,f8_w8a8_channel,torch.float16,192,320,6144,160,8,0,0,asm,10011+20000,881.3939
gfx938,f8_w8a8_channel,torch.float16,224,320,6144,160,8,0,0,asm,10011+20000,896.1813
gfx938,f8_w8a8_channel,torch.float16,256,320,6144,160,8,0,0,asm,11005+21000,912.0044
gfx938,f8_w8a8_channel,torch.float16,320,320,6144,160,8,0,0,asm,11005+21000,943.1369
gfx938,f8_w8a8_channel,torch.float16,384,320,6144,160,8,0,0,asm,11005+21000,975.6336
gfx938,f8_w8a8_channel,torch.float16,448,320,6144,160,8,0,0,asm,11005+21000,994.6231
gfx938,f8_w8a8_channel,torch.float16,512,320,6144,160,8,0,0,asm,11005+21000,1028.1387
gfx938,f8_w8a8_channel,torch.float16,576,320,6144,160,8,0,0,asm,11005+21000,1131.5826
gfx938,f8_w8a8_channel,torch.float16,640,320,6144,160,8,0,0,asm,12003+22001,1166.0162
gfx938,f8_w8a8_channel,torch.float16,704,320,6144,160,8,0,0,asm,12001+22001,1164.4584
gfx938,f8_w8a8_channel,torch.float16,768,320,6144,160,8,0,0,asm,12001+22001,1186.3363
gfx938,f8_w8a8_channel,torch.float16,832,320,6144,160,8,0,0,asm,12001+22001,1196.6605
gfx938,f8_w8a8_channel,torch.float16,896,320,6144,160,8,0,0,asm,12001+22001,1213.2753
gfx938,f8_w8a8_channel,torch.float16,960,320,6144,160,8,0,0,asm,12001+22001,1245.5952
gfx938,f8_w8a8_channel,torch.float16,1024,320,6144,160,8,0,0,asm,12001+22001,1274.4457
gfx938,f8_w8a8_channel,torch.float16,1152,320,6144,160,8,0,0,asm,12001+22001,1415.7254
gfx938,f8_w8a8_channel,torch.float16,1280,320,6144,160,8,0,0,asm,12001+22001,1862.7136
gfx938,f8_w8a8_channel,torch.float16,1408,320,6144,160,8,0,0,asm,13001+23001,1842.0736
gfx938,f8_w8a8_channel,torch.float16,1536,320,6144,160,8,0,0,asm,13001+23001,1869.8966
gfx938,f8_w8a8_channel,torch.float16,1664,320,6144,160,8,0,0,asm,13001+23001,1899.6228
gfx938,f8_w8a8_channel,torch.float16,1792,320,6144,160,8,0,0,asm,13001+23001,1928.2289
gfx938,f8_w8a8_channel,torch.float16,1920,320,6144,160,8,0,0,asm,13001+23001,1961.2308
gfx938,f8_w8a8_channel,torch.float16,2048,320,6144,160,8,0,0,asm,13001+23001,1987.4959
gfx938,f8_w8a8_channel,torch.float16,2304,320,6144,160,8,0,0,asm,13001+23001,2205.8699
gfx938,f8_w8a8_channel,torch.float16,2560,320,6144,160,8,0,0,asm,13001+23001,2666.6012
gfx938,f8_w8a8_channel,torch.float16,2816,320,6144,160,8,0,0,asm,12001+22001,3269.1511
gfx938,f8_w8a8_channel,torch.float16,3072,320,6144,160,8,0,0,asm,12001+22001,3444.1653
gfx938,f8_w8a8_channel,torch.float16,3328,320,6144,160,8,0,0,asm,12001+22001,3507.9211
gfx938,f8_w8a8_channel,torch.float16,3584,320,6144,160,8,0,0,asm,13001+23001,3695.8705
gfx938,f8_w8a8_channel,torch.float16,3840,320,6144,160,8,0,0,asm,13001+23001,3735.5589
gfx938,f8_w8a8_channel,torch.float16,4096,320,6144,160,8,0,0,asm,13001+23001,3786.7084
gfx938,f8_w8a8_channel,torch.float16,4608,320,6144,160,8,0,0,asm,13001+23001,3975.1794
gfx938,f8_w8a8_channel,torch.float16,5120,320,6144,160,8,0,0,asm,13001+23001,4655.5142
gfx938,f8_w8a8_channel,torch.float16,5632,320,6144,160,8,0,0,asm,13001+23001,5420.9347
gfx938,f8_w8a8_channel,torch.float16,6144,320,6144,160,8,0,0,asm,13001+23001,5629.4392
gfx938,f8_w8a8_channel,torch.float16,6656,320,6144,160,8,0,0,asm,13001+23001,5746.9548
gfx938,f8_w8a8_channel,torch.float16,7168,320,6144,160,8,0,0,asm,13001+23001,5931.0555
gfx938,f8_w8a8_channel,torch.float16,7680,320,6144,160,8,0,0,asm,13001+23001,6678.6912
gfx938,f8_w8a8_channel,torch.float16,8192,320,6144,160,8,0,0,asm,13001+23001,7327.6405
gfx938,f8_w8a8_channel,torch.float16,10240,320,6144,160,8,0,0,asm,13001+23001,8520.8914
gfx938,f8_w8a8_channel,torch.float16,12288,320,6144,160,8,0,0,asm,13001+23000,10064.4825
gfx938,f8_w8a8_channel,torch.float16,14336,320,6144,160,8,0,0,asm,13001+23001,11593.2692
gfx938,f8_w8a8_channel,torch.float16,16384,320,6144,160,8,0,0,asm,13001+23001,13341.9031
gfx938,f8_w8a8_channel,torch.float16,17408,320,6144,160,8,0,0,asm,13001+23001,13929.9271
gfx938,f8_w8a8_channel,torch.float16,24576,320,6144,160,8,0,0,asm,13001+23001,19364.5646
gfx938,f8_w8a8_channel,torch.float16,32768,320,6144,160,8,0,0,asm,13001+23001,25569.9553
gfx938,f8_w8a8_channel,torch.float16,40960,320,6144,160,8,0,0,asm,13001+23001,31866.0491
gfx938,f8_w8a8_channel,torch.float16,49152,320,6144,160,8,0,0,asm,13001+23001,38129.8607
gfx938,f8_w8a8_channel,torch.float16,57344,320,6144,160,8,0,0,asm,13001+23001,44350.9585
gfx938,f8_w8a8_channel,torch.float16,65536,320,6144,160,8,0,0,asm,13001+23001,50658.3987
gfx938,f8_w8a8_channel,torch.float16,65536,320,6144,160,8,0,0,asm,13001+23001,50595.5307
gfx938,f8_w8a8_channel,torch.bfloat16,1,256,4096,512,10,0,0,asm,10008+20000,85.98
gfx938,f8_w8a8_channel,torch.bfloat16,2,256,4096,512,10,0,0,asm,10011+20001,118.039
gfx938,f8_w8a8_channel,torch.bfloat16,3,256,4096,512,10,0,0,asm,10011+20000,135.8747
gfx938,f8_w8a8_channel,torch.bfloat16,4,256,4096,512,10,0,0,asm,10002+20001,172.7083
gfx938,f8_w8a8_channel,torch.bfloat16,5,256,4096,512,10,0,0,asm,10008+20001,185.7608
gfx938,f8_w8a8_channel,torch.bfloat16,6,256,4096,512,10,0,0,asm,10011+20001,206.0303
gfx938,f8_w8a8_channel,torch.bfloat16,7,256,4096,512,10,0,0,asm,10011+20000,225.2808
gfx938,f8_w8a8_channel,torch.bfloat16,8,256,4096,512,10,0,0,asm,10002+20000,267.5038
gfx938,f8_w8a8_channel,torch.bfloat16,9,256,4096,512,10,0,0,asm,10011+20000,286.0384
gfx938,f8_w8a8_channel,torch.bfloat16,10,256,4096,512,10,0,0,asm,10011+20000,299.1836
gfx938,f8_w8a8_channel,torch.bfloat16,11,256,4096,512,10,0,0,asm,10011+20000,317.5753
gfx938,f8_w8a8_channel,torch.bfloat16,12,256,4096,512,10,0,0,asm,10002+20000,358.7036
gfx938,f8_w8a8_channel,torch.bfloat16,13,256,4096,512,10,0,0,asm,10011+20000,376.0678
gfx938,f8_w8a8_channel,torch.bfloat16,14,256,4096,512,10,0,0,asm,10011+20000,385.3896
gfx938,f8_w8a8_channel,torch.bfloat16,15,256,4096,512,10,0,0,asm,10002+20000,434.7706
gfx938,f8_w8a8_channel,torch.bfloat16,16,256,4096,512,10,0,0,asm,10011+20000,452.497
gfx938,f8_w8a8_channel,torch.bfloat16,17,256,4096,512,10,0,0,asm,10011+20000,436.5192
gfx938,f8_w8a8_channel,torch.bfloat16,18,256,4096,512,10,0,0,asm,10011+20000,451.9129
gfx938,f8_w8a8_channel,torch.bfloat16,20,256,4096,512,10,0,0,asm,10011+20000,506.1359
gfx938,f8_w8a8_channel,torch.bfloat16,24,256,4096,512,10,0,0,asm,10011+20000,578.1441
gfx938,f8_w8a8_channel,torch.bfloat16,28,256,4096,512,10,0,0,asm,10011+20000,618.8681
gfx938,f8_w8a8_channel,torch.bfloat16,32,256,4096,512,10,0,0,asm,10011+20000,715.6595
gfx938,f8_w8a8_channel,torch.bfloat16,34,256,4096,512,10,0,0,asm,10011+20001,720.6531
gfx938,f8_w8a8_channel,torch.bfloat16,36,256,4096,512,10,0,0,asm,10011+20001,769.3266
gfx938,f8_w8a8_channel,torch.bfloat16,40,256,4096,512,10,0,0,asm,10011+20000,770.5728
gfx938,f8_w8a8_channel,torch.bfloat16,44,256,4096,512,10,0,0,asm,10011+20000,838.8926
gfx938,f8_w8a8_channel,torch.bfloat16,48,256,4096,512,10,0,0,asm,10011+20000,867.4567
gfx938,f8_w8a8_channel,torch.bfloat16,56,256,4096,512,10,0,0,asm,10011+20000,942.9175
gfx938,f8_w8a8_channel,torch.bfloat16,64,256,4096,512,10,0,0,asm,10011+20000,1019.0772
gfx938,f8_w8a8_channel,torch.bfloat16,68,256,4096,512,10,0,0,asm,10011+20000,1014.0247
gfx938,f8_w8a8_channel,torch.bfloat16,72,256,4096,512,10,0,0,asm,11005+21001,1107.2792
gfx938,f8_w8a8_channel,torch.bfloat16,80,256,4096,512,10,0,0,asm,10011+20000,1174.3611
gfx938,f8_w8a8_channel,torch.bfloat16,88,256,4096,512,10,0,0,asm,10011+20000,1141.0054
gfx938,f8_w8a8_channel,torch.bfloat16,96,256,4096,512,10,0,0,asm,10011+20000,1172.3738
gfx938,f8_w8a8_channel,torch.bfloat16,104,256,4096,512,10,0,0,asm,10011+20000,1203.7
gfx938,f8_w8a8_channel,torch.bfloat16,112,256,4096,512,10,0,0,asm,10012+20000,1230.1001
gfx938,f8_w8a8_channel,torch.bfloat16,128,256,4096,512,10,0,0,asm,10012+20000,1260.7946
gfx938,f8_w8a8_channel,torch.bfloat16,144,256,4096,512,10,0,0,asm,10011+20000,1281.3336
gfx938,f8_w8a8_channel,torch.bfloat16,160,256,4096,512,10,0,0,asm,10011+20000,1296.1208
gfx938,f8_w8a8_channel,torch.bfloat16,192,256,4096,512,10,0,0,asm,10011+20000,1331.9944
gfx938,f8_w8a8_channel,torch.bfloat16,224,256,4096,512,10,0,0,asm,10011+20001,1461.4509
gfx938,f8_w8a8_channel,torch.bfloat16,256,256,4096,512,10,0,0,asm,10012+20000,1374.63
gfx938,f8_w8a8_channel,torch.bfloat16,320,256,4096,512,10,0,0,asm,10011+20000,1409.8298
gfx938,f8_w8a8_channel,torch.bfloat16,384,256,4096,512,10,0,0,asm,10011+20000,1436.2382
gfx938,f8_w8a8_channel,torch.bfloat16,448,256,4096,512,10,0,0,asm,10011+20000,1479.8002
gfx938,f8_w8a8_channel,torch.bfloat16,512,256,4096,512,10,0,0,asm,10011+20000,1510.2592
gfx938,f8_w8a8_channel,torch.bfloat16,576,256,4096,512,10,0,0,asm,11005+21000,1584.1538
gfx938,f8_w8a8_channel,torch.bfloat16,640,256,4096,512,10,0,0,asm,11005+21000,1550.1751
gfx938,f8_w8a8_channel,torch.bfloat16,704,256,4096,512,10,0,0,asm,11005+21000,1570.1246
gfx938,f8_w8a8_channel,torch.bfloat16,768,256,4096,512,10,0,0,asm,11005+21000,1586.2341
gfx938,f8_w8a8_channel,torch.bfloat16,832,256,4096,512,10,0,0,asm,11005+21000,1601.8551
gfx938,f8_w8a8_channel,torch.bfloat16,896,256,4096,512,10,0,0,asm,11005+21001,1655.2444
gfx938,f8_w8a8_channel,torch.bfloat16,960,256,4096,512,10,0,0,asm,11005+21000,1640.3982
gfx938,f8_w8a8_channel,torch.bfloat16,1024,256,4096,512,10,0,0,asm,11005+21000,1671.8927
gfx938,f8_w8a8_channel,torch.bfloat16,1152,256,4096,512,10,0,0,asm,11005+21000,1730.5199
gfx938,f8_w8a8_channel,torch.bfloat16,1280,256,4096,512,10,0,0,asm,11005+21000,1781.442
gfx938,f8_w8a8_channel,torch.bfloat16,1408,256,4096,512,10,0,0,asm,11005+21001,1842.9829
gfx938,f8_w8a8_channel,torch.bfloat16,1536,256,4096,512,10,0,0,asm,11005+21001,1951.3363
gfx938,f8_w8a8_channel,torch.bfloat16,1664,256,4096,512,10,0,0,asm,12001+22001,1998.9403
gfx938,f8_w8a8_channel,torch.bfloat16,1792,256,4096,512,10,0,0,asm,12001+22001,2012.1022
gfx938,f8_w8a8_channel,torch.bfloat16,1920,256,4096,512,10,0,0,asm,12001+22001,2040.3715
gfx938,f8_w8a8_channel,torch.bfloat16,2048,256,4096,512,10,0,0,asm,12001+22001,2059.7652
gfx938,f8_w8a8_channel,torch.bfloat16,2304,256,4096,512,10,0,0,asm,12001+22001,2132.1608
gfx938,f8_w8a8_channel,torch.bfloat16,2560,256,4096,512,10,0,0,asm,12001+22001,2200.5735
gfx938,f8_w8a8_channel,torch.bfloat16,2816,256,4096,512,10,0,0,asm,12001+22001,2329.0953
gfx938,f8_w8a8_channel,torch.bfloat16,3072,256,4096,512,10,0,0,asm,12001+22001,2593.4905
gfx938,f8_w8a8_channel,torch.bfloat16,3328,256,4096,512,10,0,0,asm,12001+22001,2998.7442
gfx938,f8_w8a8_channel,torch.bfloat16,3584,256,4096,512,10,0,0,asm,13001+23001,3227.1983
gfx938,f8_w8a8_channel,torch.bfloat16,3840,256,4096,512,10,0,0,asm,13001+23001,3274.2464
gfx938,f8_w8a8_channel,torch.bfloat16,4096,256,4096,512,10,0,0,asm,13001+23001,3308.0736
gfx938,f8_w8a8_channel,torch.bfloat16,4608,256,4096,512,10,0,0,asm,13001+23001,3420.7888
gfx938,f8_w8a8_channel,torch.bfloat16,5120,256,4096,512,10,0,0,asm,13001+23001,3562.9356
gfx938,f8_w8a8_channel,torch.bfloat16,5632,256,4096,512,10,0,0,asm,13001+23001,3679.8195
gfx938,f8_w8a8_channel,torch.bfloat16,6144,256,4096,512,10,0,0,asm,13001+23001,4237.0553
gfx938,f8_w8a8_channel,torch.bfloat16,6656,256,4096,512,10,0,0,asm,13001+23001,5043.5704
gfx938,f8_w8a8_channel,torch.bfloat16,7168,256,4096,512,10,0,0,asm,12001+22001,5577.7081
gfx938,f8_w8a8_channel,torch.bfloat16,7680,256,4096,512,10,0,0,asm,12001+22001,5867.8889
gfx938,f8_w8a8_channel,torch.bfloat16,8192,256,4096,512,10,0,0,asm,12001+22001,6024.8825
gfx938,f8_w8a8_channel,torch.bfloat16,10240,256,4096,512,10,0,0,asm,13001+23001,6800.9394
gfx938,f8_w8a8_channel,torch.bfloat16,12288,256,4096,512,10,0,0,asm,13001+23001,7508.4403
gfx938,f8_w8a8_channel,torch.bfloat16,14336,256,4096,512,10,0,0,asm,13001+23001,9580.6614
gfx938,f8_w8a8_channel,torch.bfloat16,16384,256,4096,512,10,0,0,asm,13001+23001,10215.7416
gfx938,f8_w8a8_channel,torch.bfloat16,17408,256,4096,512,10,0,0,asm,13001+23001,10499.4128
gfx938,f8_w8a8_channel,torch.bfloat16,24576,256,4096,512,10,0,0,asm,13001+23001,14172.5893
gfx938,f8_w8a8_channel,torch.bfloat16,32768,256,4096,512,10,0,0,asm,13001+23001,18841.2993
gfx938,f8_w8a8_channel,torch.bfloat16,40960,256,4096,512,10,0,0,asm,13001+23001,23386.5547
gfx938,f8_w8a8_channel,torch.bfloat16,49152,256,4096,512,10,0,0,asm,13001+23001,27573.5355
gfx938,f8_w8a8_channel,torch.bfloat16,57344,256,4096,512,10,0,0,asm,13001+23001,31773.8919
gfx938,f8_w8a8_channel,torch.bfloat16,65536,256,4096,512,10,0,0,asm,13001+23001,36230.8804
gfx938,f8_w8a8_channel,torch.bfloat16,65536,256,4096,512,10,0,0,asm,13001+23001,36217.3488
gfx936,int8_w8a8_channel,torch.float16,1,512,2048,256,8,0,0,asm,10010+20000,57.5423
gfx936,int8_w8a8_channel,torch.float16,2,512,2048,256,8,0,0,asm,10013+20000,75.8832
gfx936,int8_w8a8_channel,torch.float16,3,512,2048,256,8,0,0,asm,10013+20000,103.3274
gfx936,int8_w8a8_channel,torch.float16,4,512,2048,256,8,0,0,asm,10013+20000,118.0053
gfx936,int8_w8a8_channel,torch.float16,5,512,2048,256,8,0,0,asm,10013+20000,140.1358
gfx936,int8_w8a8_channel,torch.float16,6,512,2048,256,8,0,0,asm,10013+20000,153.1881
gfx936,int8_w8a8_channel,torch.float16,7,512,2048,256,8,0,0,asm,10013+20000,181.3652
gfx936,int8_w8a8_channel,torch.float16,8,512,2048,256,8,0,0,asm,10013+20000,192.2788
gfx936,int8_w8a8_channel,torch.float16,9,512,2048,256,8,0,0,asm,10013+20000,208.2451
gfx936,int8_w8a8_channel,torch.float16,10,512,2048,256,8,0,0,asm,10008+20000,228.9861
gfx936,int8_w8a8_channel,torch.float16,11,512,2048,256,8,0,0,asm,10013+20000,240.826
gfx936,int8_w8a8_channel,torch.float16,12,512,2048,256,8,0,0,asm,10013+20000,251.5291
gfx936,int8_w8a8_channel,torch.float16,13,512,2048,256,8,0,0,asm,10011+20000,273.2555
gfx936,int8_w8a8_channel,torch.float16,14,512,2048,256,8,0,0,asm,10013+20000,285.9122
gfx936,int8_w8a8_channel,torch.float16,15,512,2048,256,8,0,0,asm,10013+20000,297.1289
gfx936,int8_w8a8_channel,torch.float16,16,512,2048,256,8,0,0,asm,10008+20000,309.1964
gfx936,int8_w8a8_channel,torch.float16,17,512,2048,256,8,0,0,asm,10009+20000,312.5196
gfx936,int8_w8a8_channel,torch.float16,18,512,2048,256,8,0,0,asm,10013+20000,320.166
gfx936,int8_w8a8_channel,torch.float16,20,512,2048,256,8,0,0,asm,10008+20000,349.8501
gfx936,int8_w8a8_channel,torch.float16,24,512,2048,256,8,0,0,asm,10009+20000,390.6669
gfx936,int8_w8a8_channel,torch.float16,28,512,2048,256,8,0,0,asm,10013+20000,454.9896
gfx936,int8_w8a8_channel,torch.float16,32,512,2048,256,8,0,0,asm,11005+20000,482.9222
gfx936,int8_w8a8_channel,torch.float16,34,512,2048,256,8,0,0,asm,11005+20000,468.7803
gfx936,int8_w8a8_channel,torch.float16,36,512,2048,256,8,0,0,asm,10011+20000,496.3086
gfx936,int8_w8a8_channel,torch.float16,40,512,2048,256,8,0,0,asm,10011+20000,516.0896
gfx936,int8_w8a8_channel,torch.float16,44,512,2048,256,8,0,0,asm,10011+20000,536.3675
gfx936,int8_w8a8_channel,torch.float16,48,512,2048,256,8,0,0,asm,10008+20000,550.9443
gfx936,int8_w8a8_channel,torch.float16,56,512,2048,256,8,0,0,asm,10011+20000,577.7821
gfx936,int8_w8a8_channel,torch.float16,64,512,2048,256,8,0,0,asm,10008+20000,597.9084
gfx936,int8_w8a8_channel,torch.float16,68,512,2048,256,8,0,0,asm,10008+20000,601.681
gfx936,int8_w8a8_channel,torch.float16,72,512,2048,256,8,0,0,asm,10008+20000,615.7609
gfx936,int8_w8a8_channel,torch.float16,80,512,2048,256,8,0,0,asm,10011+20000,636.763
gfx936,int8_w8a8_channel,torch.float16,88,512,2048,256,8,0,0,asm,10011+20000,650.9019
gfx936,int8_w8a8_channel,torch.float16,96,512,2048,256,8,0,0,asm,10011+20000,655.6177
gfx936,int8_w8a8_channel,torch.float16,104,512,2048,256,8,0,0,asm,10011+20000,665.5714
gfx936,int8_w8a8_channel,torch.float16,112,512,2048,256,8,0,0,asm,10011+20000,670.8683
gfx936,int8_w8a8_channel,torch.float16,128,512,2048,256,8,0,0,asm,10011+20000,685.1923
gfx936,int8_w8a8_channel,torch.float16,144,512,2048,256,8,0,0,asm,10008+20001,725.1081
gfx936,int8_w8a8_channel,torch.float16,160,512,2048,256,8,0,0,asm,10011+20000,696.4429
gfx936,int8_w8a8_channel,torch.float16,192,512,2048,256,8,0,0,asm,10011+20000,700.8724
gfx936,int8_w8a8_channel,torch.float16,224,512,2048,256,8,0,0,asm,10011+20000,711.4071
gfx936,int8_w8a8_channel,torch.float16,256,512,2048,256,8,0,0,asm,10011+20000,716.0387
gfx936,int8_w8a8_channel,torch.float16,320,512,2048,256,8,0,0,asm,11007+21000,774.5733
gfx936,int8_w8a8_channel,torch.float16,384,512,2048,256,8,0,0,asm,10011+20000,747.0113
gfx936,int8_w8a8_channel,torch.float16,448,512,2048,256,8,0,0,asm,11005+21000,773.7396
gfx936,int8_w8a8_channel,torch.float16,512,512,2048,256,8,0,0,asm,11005+21000,787.6848
gfx936,int8_w8a8_channel,torch.float16,576,512,2048,256,8,0,0,asm,12005+22000,861.8404
gfx936,int8_w8a8_channel,torch.float16,640,512,2048,256,8,0,0,asm,11005+21000,809.0491
gfx936,int8_w8a8_channel,torch.float16,704,512,2048,256,8,0,0,asm,11007+21000,826.4215
gfx936,int8_w8a8_channel,torch.float16,768,512,2048,256,8,0,0,asm,11005+21000,838.3794
gfx936,int8_w8a8_channel,torch.float16,832,512,2048,256,8,0,0,asm,11005+21000,849.7562
gfx936,int8_w8a8_channel,torch.float16,896,512,2048,256,8,0,0,asm,12005+22000,884.7455
gfx936,int8_w8a8_channel,torch.float16,960,512,2048,256,8,0,0,asm,12001+22000,867.4824
gfx936,int8_w8a8_channel,torch.float16,1024,512,2048,256,8,0,0,asm,12005+22000,888.8297
gfx936,int8_w8a8_channel,torch.float16,1152,512,2048,256,8,0,0,asm,12005+22000,941.4443
gfx936,int8_w8a8_channel,torch.float16,1280,512,2048,256,8,0,0,asm,12001+22000,928.9812
gfx936,int8_w8a8_channel,torch.float16,1408,512,2048,256,8,0,0,asm,12001+22000,947.9622
gfx936,int8_w8a8_channel,torch.float16,1536,512,2048,256,8,0,0,asm,12005+22000,980.9053
gfx936,int8_w8a8_channel,torch.float16,1664,512,2048,256,8,0,0,asm,12001+22000,999.9199
gfx936,int8_w8a8_channel,torch.float16,1792,512,2048,256,8,0,0,asm,12001+22000,1022.5051
gfx936,int8_w8a8_channel,torch.float16,1920,512,2048,256,8,0,0,asm,12001+22000,1070.6567
gfx936,int8_w8a8_channel,torch.float16,2048,512,2048,256,8,0,0,asm,12001+22001,1153.3089
gfx936,int8_w8a8_channel,torch.float16,2304,512,2048,256,8,0,0,asm,13001+23001,1182.4963
gfx936,int8_w8a8_channel,torch.float16,2560,512,2048,256,8,0,0,asm,13001+23001,1224.7614
gfx936,int8_w8a8_channel,torch.float16,2816,512,2048,256,8,0,0,asm,13001+23001,1245.3172
gfx936,int8_w8a8_channel,torch.float16,3072,512,2048,256,8,0,0,asm,13001+23001,1278.1086
gfx936,int8_w8a8_channel,torch.float16,3328,512,2048,256,8,0,0,asm,13001+23001,1336.2475
gfx936,int8_w8a8_channel,torch.float16,3584,512,2048,256,8,0,0,asm,13001+23001,1381.6115
gfx936,int8_w8a8_channel,torch.float16,3840,512,2048,256,8,0,0,asm,13001+23001,1493.1565
gfx936,int8_w8a8_channel,torch.float16,4096,512,2048,256,8,0,0,asm,13001+23001,1657.3499
gfx936,int8_w8a8_channel,torch.float16,4608,512,2048,256,8,0,0,asm,13001+23001,1993.45
gfx936,int8_w8a8_channel,torch.float16,5120,512,2048,256,8,0,0,asm,12001+22001,2101.3907
gfx936,int8_w8a8_channel,torch.float16,5632,512,2048,256,8,0,0,asm,13001+23001,2184.7083
gfx936,int8_w8a8_channel,torch.float16,6144,512,2048,256,8,0,0,asm,13001+23001,2237.3144
gfx936,int8_w8a8_channel,torch.float16,6656,512,2048,256,8,0,0,asm,13001+23001,2297.6341
gfx936,int8_w8a8_channel,torch.float16,7168,512,2048,256,8,0,0,asm,13001+23001,2372.7578
gfx936,int8_w8a8_channel,torch.float16,7680,512,2048,256,8,0,0,asm,13001+23001,2503.2248
gfx936,int8_w8a8_channel,torch.float16,8192,512,2048,256,8,0,0,asm,13001+23001,2796.6807
gfx936,int8_w8a8_channel,torch.float16,10240,512,2048,256,8,0,0,asm,13001+23001,3349.3024
gfx936,int8_w8a8_channel,torch.float16,12288,512,2048,256,8,0,0,asm,13001+23001,3896.5094
gfx936,int8_w8a8_channel,torch.float16,14336,512,2048,256,8,0,0,asm,13001+23001,4486.15
gfx936,int8_w8a8_channel,torch.float16,16384,512,2048,256,8,0,0,asm,13001+23001,5067.0073
gfx936,int8_w8a8_channel,torch.float16,17408,512,2048,256,8,0,0,asm,13001+23001,5469.187
gfx936,int8_w8a8_channel,torch.float16,24576,512,2048,256,8,0,0,asm,13001+23001,7372.5919
gfx936,int8_w8a8_channel,torch.float16,32768,512,2048,256,8,0,0,asm,13001+23001,11965.4224
gfx936,int8_w8a8_channel,torch.float16,40960,512,2048,256,8,0,0,asm,13001+23001,14939.3742
gfx936,int8_w8a8_channel,torch.float16,49152,512,2048,256,8,0,0,asm,13001+23001,17880.1629
gfx936,int8_w8a8_channel,torch.float16,57344,512,2048,256,8,0,0,asm,13001+23001,20746.1576
gfx936,int8_w8a8_channel,torch.float16,65536,512,2048,256,8,0,0,asm,13001+22001,24014.33
gfx936,int8_w8a8_channel,torch.float16,65536,512,2048,256,8,0,0,asm,13001+23001,23307.1407
gfx936,int8_w8a8_channel,torch.bfloat16,1,512,2048,256,8,0,0,asm,10010+20000,66.1234
gfx936,int8_w8a8_channel,torch.bfloat16,2,512,2048,256,8,0,0,asm,10013+20000,87.4538
gfx936,int8_w8a8_channel,torch.bfloat16,3,512,2048,256,8,0,0,asm,10010+20000,114.2326
gfx936,int8_w8a8_channel,torch.bfloat16,4,512,2048,256,8,0,0,asm,10013+20000,130.999
gfx936,int8_w8a8_channel,torch.bfloat16,5,512,2048,256,8,0,0,asm,10013+20000,155.5547
gfx936,int8_w8a8_channel,torch.bfloat16,6,512,2048,256,8,0,0,asm,10013+20000,167.4367
gfx936,int8_w8a8_channel,torch.bfloat16,7,512,2048,256,8,0,0,asm,10013+20000,202.2999
gfx936,int8_w8a8_channel,torch.bfloat16,8,512,2048,256,8,0,0,asm,10013+20000,208.5902
gfx936,int8_w8a8_channel,torch.bfloat16,9,512,2048,256,8,0,0,asm,10013+20000,226.6451
gfx936,int8_w8a8_channel,torch.bfloat16,10,512,2048,256,8,0,0,asm,10013+20000,253.2217
gfx936,int8_w8a8_channel,torch.bfloat16,11,512,2048,256,8,0,0,asm,10013+20000,260.186
gfx936,int8_w8a8_channel,torch.bfloat16,12,512,2048,256,8,0,0,asm,10013+20000,270.1901
gfx936,int8_w8a8_channel,torch.bfloat16,13,512,2048,256,8,0,0,asm,10013+20000,296.2028
gfx936,int8_w8a8_channel,torch.bfloat16,14,512,2048,256,8,0,0,asm,10013+20000,309.7185
gfx936,int8_w8a8_channel,torch.bfloat16,15,512,2048,256,8,0,0,asm,10013+20000,318.2238
gfx936,int8_w8a8_channel,torch.bfloat16,16,512,2048,256,8,0,0,asm,10013+20000,340.7668
gfx936,int8_w8a8_channel,torch.bfloat16,17,512,2048,256,8,0,0,asm,10013+20000,335.9133
gfx936,int8_w8a8_channel,torch.bfloat16,18,512,2048,256,8,0,0,asm,10013+20000,342.3722
gfx936,int8_w8a8_channel,torch.bfloat16,20,512,2048,256,8,0,0,asm,10011+20000,377.9258
gfx936,int8_w8a8_channel,torch.bfloat16,24,512,2048,256,8,0,0,asm,10011+20000,422.1447
gfx936,int8_w8a8_channel,torch.bfloat16,28,512,2048,256,8,0,0,asm,10013+20000,491.0148
gfx936,int8_w8a8_channel,torch.bfloat16,32,512,2048,256,8,0,0,asm,10011+20000,519.6043
gfx936,int8_w8a8_channel,torch.bfloat16,34,512,2048,256,8,0,0,asm,10011+20000,508.8813
gfx936,int8_w8a8_channel,torch.bfloat16,36,512,2048,256,8,0,0,asm,10011+20000,523.6266
gfx936,int8_w8a8_channel,torch.bfloat16,40,512,2048,256,8,0,0,asm,10011+20000,552.1233
gfx936,int8_w8a8_channel,torch.bfloat16,44,512,2048,256,8,0,0,asm,10011+20000,567.8369
gfx936,int8_w8a8_channel,torch.bfloat16,48,512,2048,256,8,0,0,asm,10011+20000,582.3801
gfx936,int8_w8a8_channel,torch.bfloat16,56,512,2048,256,8,0,0,asm,10011+20000,605.1421
gfx936,int8_w8a8_channel,torch.bfloat16,64,512,2048,256,8,0,0,asm,10011+20000,631.66
gfx936,int8_w8a8_channel,torch.bfloat16,68,512,2048,256,8,0,0,asm,10011+20000,638.2958
gfx936,int8_w8a8_channel,torch.bfloat16,72,512,2048,256,8,0,0,asm,10011+20000,648.8725
gfx936,int8_w8a8_channel,torch.bfloat16,80,512,2048,256,8,0,0,asm,10011+20000,670.4557
gfx936,int8_w8a8_channel,torch.bfloat16,88,512,2048,256,8,0,0,asm,10011+20000,685.4114
gfx936,int8_w8a8_channel,torch.bfloat16,96,512,2048,256,8,0,0,asm,10011+20000,690.6072
gfx936,int8_w8a8_channel,torch.bfloat16,104,512,2048,256,8,0,0,asm,10011+20000,682.2114
gfx936,int8_w8a8_channel,torch.bfloat16,112,512,2048,256,8,0,0,asm,10011+20000,707.9545
gfx936,int8_w8a8_channel,torch.bfloat16,128,512,2048,256,8,0,0,asm,10011+20000,720.4093
gfx936,int8_w8a8_channel,torch.bfloat16,144,512,2048,256,8,0,0,asm,10011+20000,711.2387
gfx936,int8_w8a8_channel,torch.bfloat16,160,512,2048,256,8,0,0,asm,10011+20000,730.3039
gfx936,int8_w8a8_channel,torch.bfloat16,192,512,2048,256,8,0,0,asm,10011+20000,739.9207
gfx936,int8_w8a8_channel,torch.bfloat16,224,512,2048,256,8,0,0,asm,10012+20000,746.9103
gfx936,int8_w8a8_channel,torch.bfloat16,256,512,2048,256,8,0,0,asm,10011+20000,755.8366
gfx936,int8_w8a8_channel,torch.bfloat16,320,512,2048,256,8,0,0,asm,10011+20000,780.4092
gfx936,int8_w8a8_channel,torch.bfloat16,384,512,2048,256,8,0,0,asm,10011+20000,794.2703
gfx936,int8_w8a8_channel,torch.bfloat16,448,512,2048,256,8,0,0,asm,11005+21000,800.1734
gfx936,int8_w8a8_channel,torch.bfloat16,512,512,2048,256,8,0,0,asm,11005+21000,841.2765
gfx936,int8_w8a8_channel,torch.bfloat16,576,512,2048,256,8,0,0,asm,11007+21000,863.8701
gfx936,int8_w8a8_channel,torch.bfloat16,640,512,2048,256,8,0,0,asm,11005+21000,836.1818
gfx936,int8_w8a8_channel,torch.bfloat16,704,512,2048,256,8,0,0,asm,11005+21000,858.7079
gfx936,int8_w8a8_channel,torch.bfloat16,768,512,2048,256,8,0,0,asm,11005+21000,897.6973
gfx936,int8_w8a8_channel,torch.bfloat16,832,512,2048,256,8,0,0,asm,11005+21000,929.8656
gfx936,int8_w8a8_channel,torch.bfloat16,896,512,2048,256,8,0,0,asm,12001+22000,944.9813
gfx936,int8_w8a8_channel,torch.bfloat16,960,512,2048,256,8,0,0,asm,12001+22000,955.5665
gfx936,int8_w8a8_channel,torch.bfloat16,1024,512,2048,256,8,0,0,asm,12001+22000,965.9664
gfx936,int8_w8a8_channel,torch.bfloat16,1152,512,2048,256,8,0,0,asm,12005+22000,1028.9894
gfx936,int8_w8a8_channel,torch.bfloat16,1280,512,2048,256,8,0,0,asm,12001+22000,999.2968
gfx936,int8_w8a8_channel,torch.bfloat16,1408,512,2048,256,8,0,0,asm,12001+22001,1032.0463
gfx936,int8_w8a8_channel,torch.bfloat16,1536,512,2048,256,8,0,0,asm,12001+22000,1052.8631
gfx936,int8_w8a8_channel,torch.bfloat16,1664,512,2048,256,8,0,0,asm,12001+22001,1086.3956
gfx936,int8_w8a8_channel,torch.bfloat16,1792,512,2048,256,8,0,0,asm,12001+22001,1132.8207
gfx936,int8_w8a8_channel,torch.bfloat16,1920,512,2048,256,8,0,0,asm,12001+22001,1190.7238
gfx936,int8_w8a8_channel,torch.bfloat16,2048,512,2048,256,8,0,0,asm,12005+22001,1308.5508
gfx936,int8_w8a8_channel,torch.bfloat16,2304,512,2048,256,8,0,0,asm,13001+23001,1372.2055
gfx936,int8_w8a8_channel,torch.bfloat16,2560,512,2048,256,8,0,0,asm,13001+23001,1401.258
gfx936,int8_w8a8_channel,torch.bfloat16,2816,512,2048,256,8,0,0,asm,13001+23001,1442.5548
gfx936,int8_w8a8_channel,torch.bfloat16,3072,512,2048,256,8,0,0,asm,13001+23001,1476.5842
gfx936,int8_w8a8_channel,torch.bfloat16,3328,512,2048,256,8,0,0,asm,13001+23001,1541.9145
gfx936,int8_w8a8_channel,torch.bfloat16,3584,512,2048,256,8,0,0,asm,13001+23001,1587.8681
gfx936,int8_w8a8_channel,torch.bfloat16,3840,512,2048,256,8,0,0,asm,13001+23001,1753.4172
gfx936,int8_w8a8_channel,torch.bfloat16,4096,512,2048,256,8,0,0,asm,13001+23001,2012.5745
gfx936,int8_w8a8_channel,torch.bfloat16,4608,512,2048,256,8,0,0,asm,12001+22001,2424.8344
gfx936,int8_w8a8_channel,torch.bfloat16,5120,512,2048,256,8,0,0,asm,12001+22001,2563.3436
gfx936,int8_w8a8_channel,torch.bfloat16,5632,512,2048,256,8,0,0,asm,12001+22001,2694.5769
gfx936,int8_w8a8_channel,torch.bfloat16,6144,512,2048,256,8,0,0,asm,13001+23001,2759.1072
gfx936,int8_w8a8_channel,torch.bfloat16,6656,512,2048,256,8,0,0,asm,13001+23001,2816.3026
gfx936,int8_w8a8_channel,torch.bfloat16,7168,512,2048,256,8,0,0,asm,13001+23001,2898.4665
gfx936,int8_w8a8_channel,torch.bfloat16,7680,512,2048,256,8,0,0,asm,13001+23001,3065.6827
gfx936,int8_w8a8_channel,torch.bfloat16,8192,512,2048,256,8,0,0,asm,13001+23001,3465.1172
gfx936,int8_w8a8_channel,torch.bfloat16,10240,512,2048,256,8,0,0,asm,13001+23001,4148.9045
gfx936,int8_w8a8_channel,torch.bfloat16,12288,512,2048,256,8,0,0,asm,13001+23001,4845.3656
gfx936,int8_w8a8_channel,torch.bfloat16,14336,512,2048,256,8,0,0,asm,13001+23001,5592.5893
gfx936,int8_w8a8_channel,torch.bfloat16,16384,512,2048,256,8,0,0,asm,13001+23001,6308.3012
gfx936,int8_w8a8_channel,torch.bfloat16,17408,512,2048,256,8,0,0,asm,13001+23001,6826.1859
gfx936,int8_w8a8_channel,torch.bfloat16,24576,512,2048,256,8,0,0,asm,13001+23001,9185.3035
gfx936,int8_w8a8_channel,torch.bfloat16,32768,512,2048,256,8,0,0,asm,13001+23001,12019.847
gfx936,int8_w8a8_channel,torch.bfloat16,40960,512,2048,256,8,0,0,asm,13001+23001,14983.3494
gfx936,int8_w8a8_channel,torch.bfloat16,49152,512,2048,256,8,0,0,asm,13001+23001,17938.1767
gfx936,int8_w8a8_channel,torch.bfloat16,57344,512,2048,256,8,0,0,asm,13001+23001,20809.5385
gfx936,int8_w8a8_channel,torch.bfloat16,65536,512,2048,256,8,0,0,asm,13001+23001,23354.4759
gfx936,int8_w8a8_channel,torch.bfloat16,65536,512,2048,256,8,0,0,asm,13001+23001,23370.3628
gfx938,int8_w8a8_channel,torch.bfloat16,1,512,4096,256,6,0,0,asm,10008+20000,78.0137
gfx938,int8_w8a8_channel,torch.bfloat16,2,512,4096,256,6,0,0,asm,10011+20000,104.7423
gfx938,int8_w8a8_channel,torch.bfloat16,3,512,4096,256,6,0,0,asm,10002+20000,148.4811
gfx938,int8_w8a8_channel,torch.bfloat16,4,512,4096,256,6,0,0,asm,10008+20000,174.1736
gfx938,int8_w8a8_channel,torch.bfloat16,5,512,4096,256,6,0,0,asm,10011+20000,193.5926
gfx938,int8_w8a8_channel,torch.bfloat16,6,512,4096,256,6,0,0,asm,10002+20000,246.7968
gfx938,int8_w8a8_channel,torch.bfloat16,7,512,4096,256,6,0,0,asm,10011+20000,268.0261
gfx938,int8_w8a8_channel,torch.bfloat16,8,512,4096,256,6,0,0,asm,10011+20000,281.0701
gfx938,int8_w8a8_channel,torch.bfloat16,9,512,4096,256,6,0,0,asm,10002+20000,336.7837
gfx938,int8_w8a8_channel,torch.bfloat16,10,512,4096,256,6,0,0,asm,10011+20000,349.3478
gfx938,int8_w8a8_channel,torch.bfloat16,11,512,4096,256,6,0,0,asm,10011+20000,362.3499
gfx938,int8_w8a8_channel,torch.bfloat16,12,512,4096,256,6,0,0,asm,10011+20000,368.5898
gfx938,int8_w8a8_channel,torch.bfloat16,13,512,4096,256,6,0,0,asm,10011+20000,384.8931
gfx938,int8_w8a8_channel,torch.bfloat16,14,512,4096,256,6,0,0,asm,10011+20000,435.6717
gfx938,int8_w8a8_channel,torch.bfloat16,15,512,4096,256,6,0,0,asm,10011+20000,443.6802
gfx938,int8_w8a8_channel,torch.bfloat16,16,512,4096,256,6,0,0,asm,10011+20000,449.9033
gfx938,int8_w8a8_channel,torch.bfloat16,17,512,4096,256,6,0,0,asm,10011+20000,456.8477
gfx938,int8_w8a8_channel,torch.bfloat16,18,512,4096,256,6,0,0,asm,10011+20000,500.16550000000007
gfx938,int8_w8a8_channel,torch.bfloat16,20,512,4096,256,6,0,0,asm,10011+20000,524.7801
gfx938,int8_w8a8_channel,torch.bfloat16,24,512,4096,256,6,0,0,asm,10011+20000,603.8881
gfx938,int8_w8a8_channel,torch.bfloat16,28,512,4096,256,6,0,0,asm,10011+20000,697.4143
gfx938,int8_w8a8_channel,torch.bfloat16,32,512,4096,256,6,0,0,asm,10011+20000,773.7342
gfx938,int8_w8a8_channel,torch.bfloat16,34,512,4096,256,6,0,0,asm,10011+20000,775.1206
gfx938,int8_w8a8_channel,torch.bfloat16,36,512,4096,256,6,0,0,asm,10011+20000,823.769
gfx938,int8_w8a8_channel,torch.bfloat16,40,512,4096,256,6,0,0,asm,10011+20000,865.4699
gfx938,int8_w8a8_channel,torch.bfloat16,44,512,4096,256,6,0,0,asm,10011+20000,937.0485
gfx938,int8_w8a8_channel,torch.bfloat16,48,512,4096,256,6,0,0,asm,10011+20000,988.0631
gfx938,int8_w8a8_channel,torch.bfloat16,56,512,4096,256,6,0,0,asm,10011+20000,1047.9872
gfx938,int8_w8a8_channel,torch.bfloat16,64,512,4096,256,6,0,0,asm,10011+20000,1105.1827
gfx938,int8_w8a8_channel,torch.bfloat16,68,512,4096,256,6,0,0,asm,10011+20000,1135.1615
gfx938,int8_w8a8_channel,torch.bfloat16,72,512,4096,256,6,0,0,asm,10011+20000,1172.1466
gfx938,int8_w8a8_channel,torch.bfloat16,80,512,4096,256,6,0,0,asm,10011+20000,1195.0182
gfx938,int8_w8a8_channel,torch.bfloat16,88,512,4096,256,6,0,0,asm,10011+20000,1236.5169
gfx938,int8_w8a8_channel,torch.bfloat16,96,512,4096,256,6,0,0,asm,10011+20000,1262.0663
gfx938,int8_w8a8_channel,torch.bfloat16,104,512,4096,256,6,0,0,asm,10011+20000,1281.8557
gfx938,int8_w8a8_channel,torch.bfloat16,112,512,4096,256,6,0,0,asm,10011+20000,1292.9798
gfx938,int8_w8a8_channel,torch.bfloat16,128,512,4096,256,6,0,0,asm,10011+20000,1330.4197
gfx938,int8_w8a8_channel,torch.bfloat16,144,512,4096,256,6,0,0,asm,10011+20000,1346.0662
gfx938,int8_w8a8_channel,torch.bfloat16,160,512,4096,256,6,0,0,asm,10011+20000,1386.0409
gfx938,int8_w8a8_channel,torch.bfloat16,192,512,4096,256,6,0,0,asm,10011+20000,1398.8493
gfx938,int8_w8a8_channel,torch.bfloat16,224,512,4096,256,6,0,0,asm,10011+20000,1416.8366
gfx938,int8_w8a8_channel,torch.bfloat16,256,512,4096,256,6,0,0,asm,10011+20000,1422.0661
gfx938,int8_w8a8_channel,torch.bfloat16,320,512,4096,256,6,0,0,asm,10011+20000,1439.3039
gfx938,int8_w8a8_channel,torch.bfloat16,384,512,4096,256,6,0,0,asm,11005+21000,1519.3457
gfx938,int8_w8a8_channel,torch.bfloat16,448,512,4096,256,6,0,0,asm,10011+20000,1480.4069
gfx938,int8_w8a8_channel,torch.bfloat16,512,512,4096,256,6,0,0,asm,11005+21000,1508.4069
gfx938,int8_w8a8_channel,torch.bfloat16,576,512,4096,256,6,0,0,asm,11005+21000,1582.9583
gfx938,int8_w8a8_channel,torch.bfloat16,640,512,4096,256,6,0,0,asm,11005+21000,1542.1839
gfx938,int8_w8a8_channel,torch.bfloat16,704,512,4096,256,6,0,0,asm,11005+21000,1553.5608
gfx938,int8_w8a8_channel,torch.bfloat16,768,512,4096,256,6,0,0,asm,11005+21000,1569.9736
gfx938,int8_w8a8_channel,torch.bfloat16,832,512,4096,256,6,0,0,asm,11005+21000,1596.6598
gfx938,int8_w8a8_channel,torch.bfloat16,896,512,4096,256,6,0,0,asm,11005+21000,1617.6449
gfx938,int8_w8a8_channel,torch.bfloat16,960,512,4096,256,6,0,0,asm,11005+21000,1632.1626
gfx938,int8_w8a8_channel,torch.bfloat16,1024,512,4096,256,6,0,0,asm,11005+21000,1645.7204
gfx938,int8_w8a8_channel,torch.bfloat16,1152,512,4096,256,6,0,0,asm,12001+22001,1750.0232
gfx938,int8_w8a8_channel,torch.bfloat16,1280,512,4096,256,6,0,0,asm,12001+22000,1721.5347
gfx938,int8_w8a8_channel,torch.bfloat16,1408,512,4096,256,6,0,0,asm,12001+22000,1741.7703
gfx938,int8_w8a8_channel,torch.bfloat16,1536,512,4096,256,6,0,0,asm,12001+22001,1789.3573
gfx938,int8_w8a8_channel,torch.bfloat16,1664,512,4096,256,6,0,0,asm,12001+22000,1804.4477
gfx938,int8_w8a8_channel,torch.bfloat16,1792,512,4096,256,6,0,0,asm,12001+22000,1817.0034
gfx938,int8_w8a8_channel,torch.bfloat16,1920,512,4096,256,6,0,0,asm,12001+22000,1861.1633
gfx938,int8_w8a8_channel,torch.bfloat16,2048,512,4096,256,6,0,0,asm,12001+22000,1889.8789
gfx938,int8_w8a8_channel,torch.bfloat16,2304,512,4096,256,6,0,0,asm,12001+22000,1970.5355
gfx938,int8_w8a8_channel,torch.bfloat16,2560,512,4096,256,6,0,0,asm,12001+22001,2170.8128
gfx938,int8_w8a8_channel,torch.bfloat16,2816,512,4096,256,6,0,0,asm,13001+23001,2504.1004
gfx938,int8_w8a8_channel,torch.bfloat16,3072,512,4096,256,6,0,0,asm,13001+23001,2511.8814
gfx938,int8_w8a8_channel,torch.bfloat16,3328,512,4096,256,6,0,0,asm,13001+23001,2566.197
gfx938,int8_w8a8_channel,torch.bfloat16,3584,512,4096,256,6,0,0,asm,13001+23001,2613.4642
gfx938,int8_w8a8_channel,torch.bfloat16,3840,512,4096,256,6,0,0,asm,13001+23001,2658.1883
gfx938,int8_w8a8_channel,torch.bfloat16,4096,512,4096,256,6,0,0,asm,13001+23001,2694.4411
gfx938,int8_w8a8_channel,torch.bfloat16,4608,512,4096,256,6,0,0,asm,13001+23001,2836.3946
gfx938,int8_w8a8_channel,torch.bfloat16,5120,512,4096,256,6,0,0,asm,13001+22001,3390.4479
gfx938,int8_w8a8_channel,torch.bfloat16,5632,512,4096,256,6,0,0,asm,13001+23001,4003.1115
gfx938,int8_w8a8_channel,torch.bfloat16,6144,512,4096,256,6,0,0,asm,12001+22001,4569.3499
gfx938,int8_w8a8_channel,torch.bfloat16,6656,512,4096,256,6,0,0,asm,12001+22001,4721.6274
gfx938,int8_w8a8_channel,torch.bfloat16,7168,512,4096,256,6,0,0,asm,12001+22001,4877.1131
gfx938,int8_w8a8_channel,torch.bfloat16,7680,512,4096,256,6,0,0,asm,12001+22001,5094.0808
gfx938,int8_w8a8_channel,torch.bfloat16,8192,512,4096,256,6,0,0,asm,13001+23001,5156.8006
gfx938,int8_w8a8_channel,torch.bfloat16,10240,512,4096,256,6,0,0,asm,13001+23001,5702.4324
gfx938,int8_w8a8_channel,torch.bfloat16,12288,512,4096,256,6,0,0,asm,13001+23001,7434.4938
gfx938,int8_w8a8_channel,torch.bfloat16,14336,512,4096,256,6,0,0,asm,13001+23001,7808.6131
gfx938,int8_w8a8_channel,torch.bfloat16,16384,512,4096,256,6,0,0,asm,13001+23001,8942.8737
gfx938,int8_w8a8_channel,torch.bfloat16,17408,512,4096,256,6,0,0,asm,13001+23001,9827.4874
gfx938,int8_w8a8_channel,torch.bfloat16,24576,512,4096,256,6,0,0,asm,13001+23001,12834.5617
gfx938,int8_w8a8_channel,torch.bfloat16,32768,512,4096,256,6,0,0,asm,13001+23001,16783.0803
gfx938,int8_w8a8_channel,torch.bfloat16,40960,512,4096,256,6,0,0,asm,13001+23001,20690.7507
gfx938,int8_w8a8_channel,torch.bfloat16,49152,512,4096,256,6,0,0,asm,13001+23001,24586.4703
gfx938,int8_w8a8_channel,torch.bfloat16,57344,512,4096,256,6,0,0,asm,13001+23001,28541.35
gfx938,int8_w8a8_channel,torch.bfloat16,65536,512,4096,256,6,0,0,asm,13001+23001,32417.998199999998
gfx938,int8_w8a8_channel,torch.bfloat16,1,256,4096,256,6,0,0,asm,10002+20001,59.4876
gfx938,int8_w8a8_channel,torch.bfloat16,2,256,4096,256,6,0,0,asm,10008+20000,79.9253
gfx938,int8_w8a8_channel,torch.bfloat16,3,256,4096,256,6,0,0,asm,10011+20000,102.9991
gfx938,int8_w8a8_channel,torch.bfloat16,4,256,4096,256,6,0,0,asm,10011+20000,109.1716
gfx938,int8_w8a8_channel,torch.bfloat16,5,256,4096,256,6,0,0,asm,10011+20001,121.6347
gfx938,int8_w8a8_channel,torch.bfloat16,6,256,4096,256,6,0,0,asm,10002+20001,154.6619
gfx938,int8_w8a8_channel,torch.bfloat16,7,256,4096,256,6,0,0,asm,10008+20000,164.1441
gfx938,int8_w8a8_channel,torch.bfloat16,8,256,4096,256,6,0,0,asm,10008+20001,171.8326
gfx938,int8_w8a8_channel,torch.bfloat16,9,256,4096,256,6,0,0,asm,10011+20001,189.1714
gfx938,int8_w8a8_channel,torch.bfloat16,10,256,4096,256,6,0,0,asm,10011+20000,194.6872
gfx938,int8_w8a8_channel,torch.bfloat16,11,256,4096,256,6,0,0,asm,10011+20001,200.6829
gfx938,int8_w8a8_channel,torch.bfloat16,12,256,4096,256,6,0,0,asm,10011+20001,208.1187
gfx938,int8_w8a8_channel,torch.bfloat16,13,256,4096,256,6,0,0,asm,10011+20000,215.2429
gfx938,int8_w8a8_channel,torch.bfloat16,14,256,4096,256,6,0,0,asm,10002+20000,256.8849
gfx938,int8_w8a8_channel,torch.bfloat16,15,256,4096,256,6,0,0,asm,10008+20000,262.7795
gfx938,int8_w8a8_channel,torch.bfloat16,16,256,4096,256,6,0,0,asm,10008+20000,264.8932
gfx938,int8_w8a8_channel,torch.bfloat16,17,256,4096,256,6,0,0,asm,10011+20000,259.7367
gfx938,int8_w8a8_channel,torch.bfloat16,18,256,4096,256,6,0,0,asm,10011+20000,260.9324
gfx938,int8_w8a8_channel,torch.bfloat16,20,256,4096,256,6,0,0,asm,10011+20000,276.4524
gfx938,int8_w8a8_channel,torch.bfloat16,24,256,4096,256,6,0,0,asm,10011+20000,338.6502
gfx938,int8_w8a8_channel,torch.bfloat16,28,256,4096,256,6,0,0,asm,10011+20000,374.6445
gfx938,int8_w8a8_channel,torch.bfloat16,32,256,4096,256,6,0,0,asm,10011+20000,434.6781
gfx938,int8_w8a8_channel,torch.bfloat16,34,256,4096,256,6,0,0,asm,10011+20000,419.3826
gfx938,int8_w8a8_channel,torch.bfloat16,36,256,4096,256,6,0,0,asm,10011+20000,429.2436
gfx938,int8_w8a8_channel,torch.bfloat16,40,256,4096,256,6,0,0,asm,10011+20000,451.4667
gfx938,int8_w8a8_channel,torch.bfloat16,44,256,4096,256,6,0,0,asm,10011+20000,508.71270000000004
gfx938,int8_w8a8_channel,torch.bfloat16,48,256,4096,256,6,0,0,asm,10011+20000,514.7422
gfx938,int8_w8a8_channel,torch.bfloat16,56,256,4096,256,6,0,0,asm,10011+20000,538.018
gfx938,int8_w8a8_channel,torch.bfloat16,64,256,4096,256,6,0,0,asm,10011+20000,586.3377
gfx938,int8_w8a8_channel,torch.bfloat16,68,256,4096,256,6,0,0,asm,10011+20000,592.6621
gfx938,int8_w8a8_channel,torch.bfloat16,72,256,4096,256,6,0,0,asm,10011+20000,598.0852
gfx938,int8_w8a8_channel,torch.bfloat16,80,256,4096,256,6,0,0,asm,10011+20000,606.6492
gfx938,int8_w8a8_channel,torch.bfloat16,88,256,4096,256,6,0,0,asm,10011+20000,653.2175
gfx938,int8_w8a8_channel,torch.bfloat16,96,256,4096,256,6,0,0,asm,10011+20000,659.6933
gfx938,int8_w8a8_channel,torch.bfloat16,104,256,4096,256,6,0,0,asm,10011+20000,666.8344
gfx938,int8_w8a8_channel,torch.bfloat16,112,256,4096,256,6,0,0,asm,10011+20000,670.4216
gfx938,int8_w8a8_channel,torch.bfloat16,128,256,4096,256,6,0,0,asm,10011+20000,679.7943
gfx938,int8_w8a8_channel,torch.bfloat16,144,256,4096,256,6,0,0,asm,10011+20000,686.0091
gfx938,int8_w8a8_channel,torch.bfloat16,160,256,4096,256,6,0,0,asm,10011+20001,731.4153
gfx938,int8_w8a8_channel,torch.bfloat16,192,256,4096,256,6,0,0,asm,10011+20000,713.9669
gfx938,int8_w8a8_channel,torch.bfloat16,224,256,4096,256,6,0,0,asm,10011+20000,721.7312
gfx938,int8_w8a8_channel,torch.bfloat16,256,256,4096,256,6,0,0,asm,10011+20000,729.8406
gfx938,int8_w8a8_channel,torch.bfloat16,320,256,4096,256,6,0,0,asm,10011+20000,749.0827
gfx938,int8_w8a8_channel,torch.bfloat16,384,256,4096,256,6,0,0,asm,10011+20000,765.891
gfx938,int8_w8a8_channel,torch.bfloat16,448,256,4096,256,6,0,0,asm,11005+21001,786.75
gfx938,int8_w8a8_channel,torch.bfloat16,512,256,4096,256,6,0,0,asm,10011+20001,804.5941
gfx938,int8_w8a8_channel,torch.bfloat16,576,256,4096,256,6,0,0,asm,11005+21001,882.0003
gfx938,int8_w8a8_channel,torch.bfloat16,640,256,4096,256,6,0,0,asm,11005+21001,832.0973
gfx938,int8_w8a8_channel,torch.bfloat16,704,256,4096,256,6,0,0,asm,11005+21001,843.4405
gfx938,int8_w8a8_channel,torch.bfloat16,768,256,4096,256,6,0,0,asm,11003+21001,908.7708
gfx938,int8_w8a8_channel,torch.bfloat16,832,256,4096,256,6,0,0,asm,11005+21001,868.4593
gfx938,int8_w8a8_channel,torch.bfloat16,896,256,4096,256,6,0,0,asm,11005+21001,914.3624
gfx938,int8_w8a8_channel,torch.bfloat16,960,256,4096,256,6,0,0,asm,11005+21001,924.1559
gfx938,int8_w8a8_channel,torch.bfloat16,1024,256,4096,256,6,0,0,asm,11005+21001,928.5264
gfx938,int8_w8a8_channel,torch.bfloat16,1152,256,4096,256,6,0,0,asm,11005+21000,1027.9198
gfx938,int8_w8a8_channel,torch.bfloat16,1280,256,4096,256,6,0,0,asm,12001+22001,1040.2229
gfx938,int8_w8a8_channel,torch.bfloat16,1408,256,4096,256,6,0,0,asm,12003+22001,1065.9996
gfx938,int8_w8a8_channel,torch.bfloat16,1536,256,4096,256,6,0,0,asm,12001+22001,1136.1046
gfx938,int8_w8a8_channel,torch.bfloat16,1664,256,4096,256,6,0,0,asm,12005+22001,1125.452
gfx938,int8_w8a8_channel,torch.bfloat16,1792,256,4096,256,6,0,0,asm,12001+22001,1115.6584
gfx938,int8_w8a8_channel,torch.bfloat16,1920,256,4096,256,6,0,0,asm,12001+22001,1177.1741
gfx938,int8_w8a8_channel,torch.bfloat16,2048,256,4096,256,6,0,0,asm,12005+22001,1201.3592
gfx938,int8_w8a8_channel,torch.bfloat16,2304,256,4096,256,6,0,0,asm,12001+22001,1261.3253
gfx938,int8_w8a8_channel,torch.bfloat16,2560,256,4096,256,6,0,0,asm,12001+22001,1404.8618
gfx938,int8_w8a8_channel,torch.bfloat16,2816,256,4096,256,6,0,0,asm,12001+22001,1625.1307
gfx938,int8_w8a8_channel,torch.bfloat16,3072,256,4096,256,6,0,0,asm,13001+23001,1689.2738
gfx938,int8_w8a8_channel,torch.bfloat16,3328,256,4096,256,6,0,0,asm,13001+23001,1716.6084
gfx938,int8_w8a8_channel,torch.bfloat16,3584,256,4096,256,6,0,0,asm,13001+23001,1742.6125
gfx938,int8_w8a8_channel,torch.bfloat16,3840,256,4096,256,6,0,0,asm,13001+23001,1771.8165
gfx938,int8_w8a8_channel,torch.bfloat16,4096,256,4096,256,6,0,0,asm,13001+23001,1793.0459
gfx938,int8_w8a8_channel,torch.bfloat16,4608,256,4096,256,6,0,0,asm,13001+23001,1943.1845
gfx938,int8_w8a8_channel,torch.bfloat16,5120,256,4096,256,6,0,0,asm,13001+23001,2196.4471
gfx938,int8_w8a8_channel,torch.bfloat16,5632,256,4096,256,6,0,0,asm,13001+23001,2693.1279
gfx938,int8_w8a8_channel,torch.bfloat16,6144,256,4096,256,6,0,0,asm,12001+22001,3018.0198
gfx938,int8_w8a8_channel,torch.bfloat16,6656,256,4096,256,6,0,0,asm,12001+22001,3114.0279
gfx938,int8_w8a8_channel,torch.bfloat16,7168,256,4096,256,6,0,0,asm,12001+22001,3249.5137
gfx938,int8_w8a8_channel,torch.bfloat16,7680,256,4096,256,6,0,0,asm,12001+22001,3395.9636
gfx938,int8_w8a8_channel,torch.bfloat16,8192,256,4096,256,6,0,0,asm,13001+23001,3485.6473
gfx938,int8_w8a8_channel,torch.bfloat16,10240,256,4096,256,6,0,0,asm,13001+23001,3930.8417
gfx938,int8_w8a8_channel,torch.bfloat16,12288,256,4096,256,6,0,0,asm,13001+23001,5137.4904
gfx938,int8_w8a8_channel,torch.bfloat16,14336,256,4096,256,6,0,0,asm,13001+23001,5479.5268
gfx938,int8_w8a8_channel,torch.bfloat16,16384,256,4096,256,6,0,0,asm,13001+23001,6239.061
gfx938,int8_w8a8_channel,torch.bfloat16,17408,256,4096,256,6,0,0,asm,13001+23001,6896.6919
gfx938,int8_w8a8_channel,torch.bfloat16,24576,256,4096,256,6,0,0,asm,13001+23001,8995.4132
gfx938,int8_w8a8_channel,torch.bfloat16,32768,256,4096,256,6,0,0,asm,13001+23001,11788.1793
gfx938,int8_w8a8_channel,torch.bfloat16,40960,256,4096,256,6,0,0,asm,13001+23000,14816.1534
gfx938,int8_w8a8_channel,torch.bfloat16,49152,256,4096,256,6,0,0,asm,13001+23001,17277.1915
gfx938,int8_w8a8_channel,torch.bfloat16,57344,256,4096,256,6,0,0,asm,13001+23001,20062.2133
gfx938,int8_w8a8_channel,torch.bfloat16,65536,256,4096,256,6,0,0,asm,13001+23001,22777.5632
gfx938,int8_w8a8_channel,torch.bfloat16,65536,256,4096,256,6,0,0,asm,13001+23001,22900.0209
\ No newline at end of file
......@@ -1268,3 +1268,72 @@ gfx938,f8_w8a8_block,torch.float16,14336,256,4096,256,8,0,0,asm,13001+23000,5030
gfx938,f8_w8a8_block,torch.float16,16384,256,4096,256,8,0,0,asm,13001+23000,5608.4473
gfx938,f8_w8a8_block,torch.float16,17408,256,4096,256,8,0,0,asm,13001+23000,6038.0465
gfx938,f8_w8a8_block,torch.float16,24576,256,4096,256,8,0,0,asm,13001+23000,8143.5178
gfx938,f8_w8a8_block,torch.float16,1,512,4096,256,8,0,0,asm,10007+20000,82.9571
gfx938,f8_w8a8_block,torch.float16,2,512,4096,256,8,0,0,asm,10001+20000,112.2287
gfx938,f8_w8a8_block,torch.float16,4,512,4096,256,8,0,0,asm,10002+20000,168.5064
gfx938,f8_w8a8_block,torch.float16,6,512,4096,256,8,0,0,asm,10002+20000,218.1063
gfx938,f8_w8a8_block,torch.float16,8,512,4096,256,8,0,0,asm,10002+20000,257.9547
gfx938,f8_w8a8_block,torch.float16,10,512,4096,256,8,0,0,asm,10002+20000,302.8811
gfx938,f8_w8a8_block,torch.float16,12,512,4096,256,8,0,0,asm,10002+20000,333.6432
gfx938,f8_w8a8_block,torch.float16,14,512,4096,256,8,0,0,asm,10002+20000,373.4999
gfx938,f8_w8a8_block,torch.float16,16,512,4096,256,8,0,0,asm,10002+20000,393.4746
gfx938,f8_w8a8_block,torch.float16,20,512,4096,256,8,0,0,asm,10002+20000,455.5854
gfx938,f8_w8a8_block,torch.float16,24,512,4096,256,8,0,0,asm,10001+20000,514.044
gfx938,f8_w8a8_block,torch.float16,28,512,4096,256,8,0,0,asm,10002+20000,582.8807
gfx938,f8_w8a8_block,torch.float16,32,512,4096,256,8,0,0,asm,10002+20000,618.847
gfx938,f8_w8a8_block,torch.float16,36,512,4096,256,8,0,0,asm,10002+20000,647.2482
gfx938,f8_w8a8_block,torch.float16,40,512,4096,256,8,0,0,asm,10001+20000,676.0396
gfx938,f8_w8a8_block,torch.float16,44,512,4096,256,8,0,0,asm,10002+20000,707.8039
gfx938,f8_w8a8_block,torch.float16,48,512,4096,256,8,0,0,asm,10002+20000,718.1197
gfx938,f8_w8a8_block,torch.float16,56,512,4096,256,8,0,0,asm,10001+20000,757.1511
gfx938,f8_w8a8_block,torch.float16,64,512,4096,256,8,0,0,asm,10002+20000,781.3533
gfx938,f8_w8a8_block,torch.float16,80,512,4096,256,8,0,0,asm,10002+20000,834.0691
gfx938,f8_w8a8_block,torch.float16,96,512,4096,256,8,0,0,asm,10002+20000,871.6102
gfx938,f8_w8a8_block,torch.float16,112,512,4096,256,8,0,0,asm,10002+20000,885.1175
gfx938,f8_w8a8_block,torch.float16,128,512,4096,256,8,0,0,asm,10002+20000,903.3996
gfx938,f8_w8a8_block,torch.float16,160,512,4096,256,8,0,0,asm,10002+20000,918.0774
gfx938,f8_w8a8_block,torch.float16,192,512,4096,256,8,0,0,asm,10001+20000,936.3676
gfx938,f8_w8a8_block,torch.float16,224,512,4096,256,8,0,0,asm,10002+20000,943.6689
gfx938,f8_w8a8_block,torch.float16,256,512,4096,256,8,0,0,asm,10002+20000,951.9721
gfx938,f8_w8a8_block,torch.float16,320,512,4096,256,8,0,0,asm,10002+20000,969.4878
gfx938,f8_w8a8_block,torch.float16,384,512,4096,256,8,0,0,asm,10006+20000,1018.5991999999999
gfx938,f8_w8a8_block,torch.float16,448,512,4096,256,8,0,0,asm,11008+21000,1043.5172
gfx938,f8_w8a8_block,torch.float16,512,512,4096,256,8,0,0,asm,11010+21000,1063.8287
gfx938,f8_w8a8_block,torch.float16,576,512,4096,256,8,0,0,asm,11010+21000,1103.7024
gfx938,f8_w8a8_block,torch.float16,640,512,4096,256,8,0,0,asm,11007+21000,1088.9825
gfx938,f8_w8a8_block,torch.float16,704,512,4096,256,8,0,0,asm,11010+21000,1061.7066
gfx938,f8_w8a8_block,torch.float16,768,512,4096,256,8,0,0,asm,11010+21000,1106.7677
gfx938,f8_w8a8_block,torch.float16,832,512,4096,256,8,0,0,asm,11010+21000,1127.4246
gfx938,f8_w8a8_block,torch.float16,896,512,4096,256,8,0,0,asm,11010+21000,1168.4432
gfx938,f8_w8a8_block,torch.float16,960,512,4096,256,8,0,0,asm,11010+21000,1173.7484
gfx938,f8_w8a8_block,torch.float16,1024,512,4096,256,8,0,0,asm,11010+21000,1219.298
gfx938,f8_w8a8_block,torch.float16,1152,512,4096,256,8,0,0,asm,12002+22000,1338.2958
gfx938,f8_w8a8_block,torch.float16,1280,512,4096,256,8,0,0,asm,12003+22000,1279.3149
gfx938,f8_w8a8_block,torch.float16,1408,512,4096,256,8,0,0,asm,12003+22000,1309.1085
gfx938,f8_w8a8_block,torch.float16,1536,512,4096,256,8,0,0,asm,12002+22000,1346.0515
gfx938,f8_w8a8_block,torch.float16,1664,512,4096,256,8,0,0,asm,12003+22000,1330.7168
gfx938,f8_w8a8_block,torch.float16,1792,512,4096,256,8,0,0,asm,12002+22000,1433.3104
gfx938,f8_w8a8_block,torch.float16,1920,512,4096,256,8,0,0,asm,12004+22000,1557.4367
gfx938,f8_w8a8_block,torch.float16,2048,512,4096,256,8,0,0,asm,12003+22000,1661.2428
gfx938,f8_w8a8_block,torch.float16,2304,512,4096,256,8,0,0,asm,12005+22000,1940.813
gfx938,f8_w8a8_block,torch.float16,2560,512,4096,256,8,0,0,asm,13001+22000,2072.7203
gfx938,f8_w8a8_block,torch.float16,2816,512,4096,256,8,0,0,asm,13001+22000,2095.2974
gfx938,f8_w8a8_block,torch.float16,3072,512,4096,256,8,0,0,asm,13001+22000,2144.6107
gfx938,f8_w8a8_block,torch.float16,3328,512,4096,256,8,0,0,asm,13001+22000,2175.5328
gfx938,f8_w8a8_block,torch.float16,3584,512,4096,256,8,0,0,asm,13001+22000,2234.2864
gfx938,f8_w8a8_block,torch.float16,3840,512,4096,256,8,0,0,asm,12005+22000,2454.0841
gfx938,f8_w8a8_block,torch.float16,4096,512,4096,256,8,0,0,asm,12005+22000,2680.5851
gfx938,f8_w8a8_block,torch.float16,4608,512,4096,256,8,0,0,asm,12005+22000,3087.0188
gfx938,f8_w8a8_block,torch.float16,5120,512,4096,256,8,0,0,asm,12005+22000,3226.8833
gfx938,f8_w8a8_block,torch.float16,5632,512,4096,256,8,0,0,asm,12005+22000,3405.6863
gfx938,f8_w8a8_block,torch.float16,6144,512,4096,256,8,0,0,asm,12005+22000,3731.7076
gfx938,f8_w8a8_block,torch.float16,6656,512,4096,256,8,0,0,asm,13001+23000,3931.2194
gfx938,f8_w8a8_block,torch.float16,7168,512,4096,256,8,0,0,asm,13001+23000,4010.6804
gfx938,f8_w8a8_block,torch.float16,7680,512,4096,256,8,0,0,asm,13001+23000,4195.0509
gfx938,f8_w8a8_block,torch.float16,8192,512,4096,256,8,0,0,asm,13001+23000,4642.5692
gfx938,f8_w8a8_block,torch.float16,10240,512,4096,256,8,0,0,asm,13001+23000,5698.2073
gfx938,f8_w8a8_block,torch.float16,12288,512,4096,256,8,0,0,asm,13001+23000,6601.8277
gfx938,f8_w8a8_block,torch.float16,14336,512,4096,256,8,0,0,asm,13001+23000,7572.0575
gfx938,f8_w8a8_block,torch.float16,16384,512,4096,256,8,0,0,asm,13001+23000,8551.2295
gfx938,f8_w8a8_block,torch.float16,17408,512,4096,256,8,0,0,asm,13001+23000,9230.6924
gfx938,f8_w8a8_block,torch.float16,24576,512,4096,256,8,0,0,asm,13001+23000,12357.5934
\ No newline at end of file
......@@ -19,26 +19,78 @@ from typing import Any, Dict, Optional, Union
import torch
import torch.distributed
from .parallel_state import get_tp_group
from .parallel_state import get_tp_group, get_custom_group, has_custom_group
def tensor_model_parallel_all_reduce(
    input_: torch.Tensor,
    use_new: bool = True,
    open_fp8_quant: bool = False,
    prefill_support: bool = False,
) -> torch.Tensor:
    """All-reduce the input tensor across model parallel group.

    Every argument is forwarded positionally, in declaration order, to the
    ``all_reduce`` method of the group returned by ``get_tp_group()``.

    Args:
        input_: Tensor to reduce.
        use_new: Forwarded as-is to the group's ``all_reduce``.
        open_fp8_quant: Forwarded as-is to the group's ``all_reduce``.
        prefill_support: Forwarded as-is to the group's ``all_reduce``.

    Returns:
        Whatever the group's ``all_reduce`` returns.
    """
    return get_tp_group().all_reduce(input_, use_new, open_fp8_quant, prefill_support)
def tensor_model_parallel_fused_allreduce_rmsnorm(
    input_: torch.Tensor,
    residual_inp_: torch.Tensor,
    weight_: torch.Tensor,
    eps: float,
    prefill_support: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Run the tensor-parallel group's fused all-reduce + RMSNorm.

    Thin wrapper: all arguments are forwarded positionally, in declaration
    order, to ``get_tp_group().fused_allreduce_rmsnorm``.

    Args:
        input_: Tensor to all-reduce and normalize.
        residual_inp_: Residual input passed through to the group method.
        weight_: RMSNorm weight passed through to the group method.
        eps: RMSNorm epsilon.
        prefill_support: Forwarded as-is to the group method.

    Returns:
        The two-tensor tuple produced by the group method.
    """
    return get_tp_group().fused_allreduce_rmsnorm(
        input_, residual_inp_, weight_, eps, prefill_support
    )
def tensor_model_parallel_fused_allreduce_rmsnorm_quant(
    input_: torch.Tensor,
    residual_inp_: torch.Tensor,
    weight_: torch.Tensor,
    eps: float,
    prefill_support: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Run the tensor-parallel group's fused all-reduce + RMSNorm + quant.

    Thin wrapper: the arguments are handed positionally, in declaration
    order, to ``fused_allreduce_rmsnorm_quant`` on the group returned by
    ``get_tp_group()``, and its three-tensor result is returned untouched.
    """
    tp_group = get_tp_group()
    result = tp_group.fused_allreduce_rmsnorm_quant(
        input_, residual_inp_, weight_, eps, prefill_support
    )
    return result
def tensor_model_parallel_fused_allreduce_rmsnorm_quant_per_group(
    input_: torch.Tensor,
    residual_inp_: torch.Tensor,
    weight_: torch.Tensor,
    eps: float,
    group_size: int = 128,
    prefill_support: bool = False,
    emit_bf16: bool = False,
):
    """Run the tensor-parallel group's per-group-quantized fused AR + RMSNorm.

    Thin wrapper around ``fused_allreduce_rmsnorm_quant_per_group`` on the
    group returned by ``get_tp_group()``. Everything up to and including
    ``prefill_support`` is forwarded positionally; ``emit_bf16`` is forwarded
    by keyword. The group method's return value is passed back unchanged.
    """
    tp_group = get_tp_group()
    return tp_group.fused_allreduce_rmsnorm_quant_per_group(
        input_,
        residual_inp_,
        weight_,
        eps,
        group_size,
        prefill_support,
        emit_bf16=emit_bf16,
    )
def tensor_model_parallel_fused_qknorm_allreduce(
    qkv_in: torch.Tensor,
    q_w: torch.Tensor,
    k_w: torch.Tensor,
    eps: float,
):
    """Delegate to ``fused_qknorm_allreduce`` on the tensor-parallel group.

    The four arguments are forwarded positionally in declaration order and
    the group method's result is returned as-is.
    """
    tp_group = get_tp_group()
    return tp_group.fused_qknorm_allreduce(qkv_in, q_w, k_w, eps)
def tensor_model_parallel_custom_all_gather(input_: torch.Tensor) -> torch.Tensor:
    """Delegate to ``custom_all_gather`` on the tensor-parallel group."""
    tp_group = get_tp_group()
    return tp_group.custom_all_gather(input_)
def tensor_model_parallel_reduce_scatter(
    input_: torch.Tensor,
    use_custom: bool = True,
    dim: int = 0,
) -> torch.Tensor:
    """Delegate to ``reduce_scatter_tensor`` on the tensor-parallel group.

    ``input_``, ``use_custom`` and ``dim`` are forwarded positionally in that
    order; the group method's result is returned unchanged.
    """
    tp_group = get_tp_group()
    return tp_group.reduce_scatter_tensor(input_, use_custom, dim)
def tensor_model_parallel_all_gather(
input_: torch.Tensor, use_custom: bool = False, dim: int = -1
) -> torch.Tensor:
......@@ -59,3 +111,66 @@ def broadcast_tensor_dict(
if not torch.distributed.is_initialized():
return tensor_dict
return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
# ============================================================
# Custom group communication operations
# ============================================================
def _assert_has_custom_group():
    """Fail fast when no custom communication group has been initialized.

    Raises:
        AssertionError: If ``has_custom_group()`` is falsy.
    """
    # Raise explicitly instead of using an ``assert`` statement: asserts are
    # compiled out under ``python -O``, which would silently disable this
    # precondition check. The exception type is kept as AssertionError so
    # existing callers observe the same failure.
    if not has_custom_group():
        raise AssertionError(
            "No custom group initialized. Call ensure_model_parallel_initialized "
            "with custom_group_config to initialize custom groups."
        )
def custom_all_reduce(
    input_: torch.Tensor,
    use_new: bool = True,
    open_fp8_quant: bool = False,
    group: Optional[str] = None,
) -> torch.Tensor:
    """All-reduce ``input_`` across a user-defined custom group.

    Args:
        input_: Tensor to reduce.
        use_new: Forwarded as-is to the group's ``all_reduce``.
        open_fp8_quant: Forwarded as-is to the group's ``all_reduce``.
        group: Name of the custom group. May be omitted when only one custom
            group is initialized; when several exist, pass the name of the
            one to use.

    Returns:
        The result of the selected group's ``all_reduce``.
    """
    # Verify a custom group exists before attempting the lookup.
    _assert_has_custom_group()
    target_group = get_custom_group(group)
    return target_group.all_reduce(input_, use_new, open_fp8_quant)
def custom_all_gather(
    input_: torch.Tensor,
    use_custom: bool = True,
    dim: int = 0,
    group: Optional[str] = None,
) -> torch.Tensor:
    """All-gather ``input_`` across a user-defined custom group.

    Args:
        input_: Tensor to gather.
        use_custom: Forwarded as-is to the group's ``all_gather``.
        dim: Forwarded as-is to the group's ``all_gather``.
        group: Name of the custom group. May be omitted when only one custom
            group is initialized; when several exist, pass the name of the
            one to use.

    Returns:
        The result of the selected group's ``all_gather``.
    """
    # Verify a custom group exists before attempting the lookup.
    _assert_has_custom_group()
    target_group = get_custom_group(group)
    return target_group.all_gather(input_, use_custom, dim)
def custom_reduce_scatter(
    input_: torch.Tensor,
    use_custom: bool = True,
    dim: int = 0,
    group: Optional[str] = None,
) -> torch.Tensor:
    """Reduce-scatter ``input_`` across a user-defined custom group.

    Args:
        input_: Tensor to reduce-scatter.
        use_custom: Forwarded as-is to the group's ``reduce_scatter_tensor``.
        dim: Forwarded as-is to the group's ``reduce_scatter_tensor``.
        group: Name of the custom group. May be omitted when only one custom
            group is initialized; when several exist, pass the name of the
            one to use.

    Returns:
        The result of the selected group's ``reduce_scatter_tensor``.
    """
    # Verify a custom group exists before attempting the lookup.
    _assert_has_custom_group()
    target_group = get_custom_group(group)
    return target_group.reduce_scatter_tensor(input_, use_custom, dim)
......@@ -2,6 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import torch
from torch.distributed import ProcessGroup
......@@ -11,6 +13,13 @@ from aiter import logger
from .base_device_communicator import DeviceCommunicatorBase
def _env_flag(name: str, default: bool) -> bool:
    """Read environment variable ``name`` as a boolean switch.

    Returns ``default`` when the variable is not set. Otherwise the value is
    stripped and lower-cased, and the result is True only for "1", "true",
    "yes" or "on"; any other value (including the empty string) is False.
    """
    raw = os.environ.get(name)
    if raw is not None:
        normalized = raw.strip().lower()
        return normalized in ("1", "true", "yes", "on")
    return default
class CudaCommunicator(DeviceCommunicatorBase):
def __init__(
self,
......@@ -64,9 +73,20 @@ class CudaCommunicator(DeviceCommunicatorBase):
if use_custom_allreduce and self.world_size > 1:
# Initialize a custom fast all-reduce implementation.
# AITER_AR_ENABLE_REG_CAPTURE controls whether inputs captured
# inside a CUDA graph are assumed to already live in the
# pre-registered IPC buffer (True, default), or whether the
# in-graph all-reduce should fall back to the unregistered
# copy-in path (False). Set this to "0" when callers cannot
# guarantee that captured input pointers were registered via
# ``CustomAllreduce.register_buffer``.
enable_register_for_capturing = _env_flag(
"AITER_AR_ENABLE_REG_CAPTURE", default=True
)
self.ca_comm = CustomAllreduce(
group=self.cpu_group,
device=self.device,
enable_register_for_capturing=enable_register_for_capturing,
# symm_mem_enabled=(
# self.symm_mem_comm is not None and not self.symm_mem_comm.disabled
# ),
......@@ -118,7 +138,13 @@ class CudaCommunicator(DeviceCommunicatorBase):
self.all2all_manager.__class__.__name__,
)
def all_reduce(self, input_, ca_fp8_quant: bool = False) -> torch.Tensor:
def all_reduce(
self,
input_,
use_new: bool = True,
ca_fp8_quant: bool = False,
prefill_support: bool = False,
) -> torch.Tensor:
# always try quick reduce first, then custom allreduce,
# and then pynccl. (quick reduce just for ROCM MI3*)
qr_comm = self.qr_comm
......@@ -137,7 +163,7 @@ class CudaCommunicator(DeviceCommunicatorBase):
and not ca_comm.disabled
and ca_comm.should_custom_ar(input_)
):
out = ca_comm.custom_all_reduce(input_, ca_fp8_quant)
out = ca_comm.custom_all_reduce(input_, use_new=use_new, open_fp8_quant=ca_fp8_quant)
assert out is not None
return out
symm_mem_comm = self.symm_mem_comm
......@@ -159,7 +185,7 @@ class CudaCommunicator(DeviceCommunicatorBase):
return out
def fused_allreduce_rmsnorm(
self, input_, res_inp_, weight_, eps
self, input_, res_inp_, weight_, eps, prefill_support: bool = False
) -> tuple[torch.Tensor, torch.Tensor]:
n = input_.shape[-1]
can_use_fuse_ar_rms = (
......@@ -174,10 +200,12 @@ class CudaCommunicator(DeviceCommunicatorBase):
and ca_comm.should_custom_ar(input_)
and can_use_fuse_ar_rms
):
res_out, out = ca_comm.custom_fused_ar_rms(input_, res_inp_, weight_, eps)
out, res_out = ca_comm.custom_fused_ar_rms(
input_, res_inp_, weight_, eps, use_1stage=prefill_support
)
assert out is not None
assert res_out is not None
return res_out, out
return out, res_out
# call split kernel
ar_out = self.all_reduce(input_)
out = torch.empty_like(ar_out)
......@@ -193,7 +221,138 @@ class CudaCommunicator(DeviceCommunicatorBase):
eps,
0,
)
return residual_out, out
return out, residual_out
def fused_allreduce_rmsnorm_quant(
    self,
    input_,
    res_inp_,
    weight_,
    eps,
    prefill_support: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """All-reduce + RMSNorm + per-token FP8 quantization.

    Tries the single fused custom kernel first and otherwise falls back to
    the split path (``fused_allreduce_rmsnorm`` followed by a separate
    per-token ``hip_quant``).

    Args:
        input_: tensor to all-reduce; its last dimension is the hidden size K.
        res_inp_: residual input forwarded to the kernels.
        weight_: RMSNorm weight forwarded to the kernels.
        eps: RMSNorm epsilon.
        prefill_support: forwarded to ``fused_allreduce_rmsnorm`` on the
            split path only.

    Returns:
        ``(out, res_out, scale_out)``.
    """
    total_bytes = input_.numel() * input_.element_size()
    K = int(input_.shape[-1])
    # Inputs of at most 128 KB use the 1-stage kernel variant.
    use_1stage = total_bytes <= 128 * 1024
    # Hygon (gfx938/gfx946) kernel-level bug: the fused
    # AR+RMSNorm+FP8-quant kernel produces 100% NaN for bf16 at
    # K=4096 whenever use_1stage=False, regardless of which post-
    # 1-stage path is dispatched (the 2-stage 128KB-512KB path AND
    # the split >512KB path both fail). Empirically confirmed on
    # shapes (17,4096), (32,4096) and (128,4096); larger shapes such
    # as (512,4096) are very likely affected too but were not
    # exercised. The 2-stage kernel works for fp16 at the same
    # K=4096, and for bf16 at K=7168 / K=8192; the failure is
    # therefore K=4096-and-bf16-specific.
    # Until the C++ kernel is fixed upstream, fall back to the
    # Python split path (RMSNorm-only fused kernel + separate
    # hip_quant) for the entire problematic configuration.
    problematic_bf16_non_1stage = (
        input_.dtype == torch.bfloat16
        and not use_1stage
        and K == 4096
    )
    ca_comm = getattr(self, "ca_comm", None)
    fused = None
    if (
        ca_comm is not None
        and K in (512, 1024, 2048, 4096)
        and total_bytes <= 4096 * 1024
        and not problematic_bf16_non_1stage
    ):
        # Returns None when custom all-reduce is disabled or declines this
        # input; that must lead to the split path, not a failed unpack.
        fused = ca_comm.custom_fused_ar_rms_quant(
            input_, res_inp_, weight_, eps, use_1stage
        )
    if fused is not None:
        out, res_out, scale_out = fused
    else:
        out_, res_out = self.fused_allreduce_rmsnorm(
            input_, res_inp_, weight_, eps, prefill_support
        )
        from aiter import get_hip_quant, QuantType
        from aiter.utility.dtypes import fp8

        hip_quant = get_hip_quant(QuantType.per_Token)
        out, scale_out = hip_quant(out_, quant_dtype=fp8)
    assert out is not None
    assert res_out is not None
    assert scale_out is not None
    return out, res_out, scale_out
def fused_allreduce_rmsnorm_quant_per_group(
    self,
    input_,
    res_inp_,
    weight_,
    eps,
    group_size=128,
    prefill_support: bool = False,
    emit_bf16: bool = False,
):
    """All-reduce + RMSNorm + per-group FP8 quantization.

    Tries the fused custom kernel first; if it is not eligible, declines
    the input (returns None) or raises, falls back to the split path
    (``fused_allreduce_rmsnorm`` followed by a separate ``hip_quant``).

    Args:
        input_: tensor to all-reduce; its last dimension is the hidden size K.
        res_inp_: residual input forwarded to the kernels.
        weight_: RMSNorm weight forwarded to the kernels.
        eps: RMSNorm epsilon.
        group_size: quantization group size passed to the fused kernel.
        prefill_support: forwarded to ``fused_allreduce_rmsnorm`` on the
            split path only.
        emit_bf16: when True, also return the unquantized normed output.

    Returns:
        ``(out, res_out, scale_out)``, or
        ``(out, res_out, scale_out, bf16_out)`` when ``emit_bf16`` is True.
    """
    total_bytes = input_.numel() * input_.element_size()
    K = int(input_.shape[-1])
    # Inputs of at most 128 KB use the 1-stage kernel variant.
    use_1stage = total_bytes <= 128 * 1024
    out = res_out = scale_out = bf16_out = None
    fused_ok = False
    # See ``fused_allreduce_rmsnorm_quant`` for context, with one
    # important difference: per-token quant's custom-kernel
    # whitelist is K in {512, 1024, 2048, 4096}, so larger K values
    # (6144 / 7168 / 8192) always go to the Python fallback there
    # and never expose the kernel bug for those K. Per-group quant
    # has a much wider whitelist (any K with K % group_size == 0
    # and K <= 16384), so it surfaces the same bug at additional K
    # values (K=4096 and K=6144 both empirically confirmed NaN;
    # K=7168 / K=8192 untested but likely affected). Widen the
    # fallback to all bf16 + non-1-stage configurations to be safe;
    # the perf cost is limited since this only affects bf16 inputs
    # whose total bytes exceed 128 KB (medium / large prefill).
    problematic_bf16_non_1stage = (
        input_.dtype == torch.bfloat16
        and not use_1stage
    )
    ca_comm = getattr(self, "ca_comm", None)
    if (
        ca_comm is not None
        and K % group_size == 0
        and K <= 16384
        and total_bytes < 8 * 1024 * 8192
        and not problematic_bf16_non_1stage
    ):
        try:
            result = ca_comm.custom_fused_ar_rms_per_group_quant(
                input_, res_inp_, weight_, eps, group_size, use_1stage,
                emit_bf16=emit_bf16,
            )
            # None means custom all-reduce is disabled or declined the
            # input; that is an expected outcome, not an error.
            if result is not None:
                if emit_bf16:
                    out, res_out, scale_out, bf16_out = result
                else:
                    out, res_out, scale_out = result
                fused_ok = True
        except Exception:
            # Best-effort: keep falling back to the split path, but make
            # the failure visible instead of swallowing it silently.
            import logging

            logging.getLogger(__name__).warning(
                "Fused AR+RMSNorm per-group quant kernel failed; "
                "falling back to the split path",
                exc_info=True,
            )
    if not fused_ok:
        out_, res_out = self.fused_allreduce_rmsnorm(
            input_, res_inp_, weight_, eps, prefill_support
        )
        from aiter import get_hip_quant, QuantType
        from aiter.utility.dtypes import fp8

        # NOTE(review): the fallback always quantizes with per_1x128 and
        # ignores ``group_size`` -- confirm callers never reach this path
        # with group_size != 128.
        hip_quant = get_hip_quant(QuantType.per_1x128)
        out, scale_out = hip_quant(out_, quant_dtype=fp8)
        if emit_bf16:
            bf16_out = out_
    assert out is not None
    assert res_out is not None
    assert scale_out is not None
    if emit_bf16:
        assert bf16_out is not None
        return out, res_out, scale_out, bf16_out
    return out, res_out, scale_out
def fused_qknorm_allreduce(
    self,
    qkv_in,
    q_w,
    k_w,
    eps,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Delegate to the custom all-reduce backend's fused QK-norm kernel.

    Forwards all arguments to ``ca_comm.custom_fused_qknorm_ar`` and
    returns its ``(q_out, k_out, v_out)`` triple after checking that none
    of the three is None.
    """
    fused = self.ca_comm.custom_fused_qknorm_ar(qkv_in, q_w, k_w, eps)
    q_out, k_out, v_out = fused
    assert q_out is not None
    assert k_out is not None
    assert v_out is not None
    return q_out, k_out, v_out
def reduce_scatter(self, input_: torch.Tensor, dim: int = -1):
world_size = self.world_size
......
"""
* Copyright (C) 2024-2025, The vLLM team.
* Copyright (C) Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2024-2026, The vLLM team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -14,8 +15,9 @@
* limitations under the License.
"""
import pickle
from contextlib import contextmanager
from typing import Any, List, Optional, Union
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.distributed as dist
......@@ -26,6 +28,7 @@ from torch.distributed import ProcessGroup
import aiter as ops
from aiter.dist.parallel_state import in_the_same_node_as
from aiter import logger
from aiter.utility.dtypes import fp8
try:
ops.meta_size()
......@@ -43,6 +46,273 @@ def is_weak_contiguous(inp: torch.Tensor):
)
# Wavefront width on AMD CDNA / gfx94x / gfx950. ``__shfl_xor`` in the
# fused per-group FP8 quant epilogue is scoped to a single wavefront, so
# ``threads_per_group = group_size / PACK_SIZE`` must fit inside it.
_AITER_AR_WAVEFRONT_SIZE = 64
def _validate_per_group_size(group_size: int, element_size: int, n: int) -> None:
"""Validate ``group_size`` for the fused AR + RMSNorm + per-group FP8
quant kernel. Mirrors the C++ host dispatcher checks in
``dispatchFusedAllReduceRMSNormQuantPerGroup`` so callers fail fast
with a clear Python-level ``ValueError`` (rather than a generic
``RuntimeError`` from the extension, which aborts CUDA-graph capture
asynchronously).
The fused epilogue imposes five constraints on ``group_size``:
(a) ``group_size > 0``
(b) ``group_size % PACK_SIZE == 0`` with ``PACK_SIZE = 16 // element_size``
(each thread owns a full 16-byte pack, so a group must be made of
whole packs).
(c) ``threads_per_group = group_size / PACK_SIZE`` must be a power of two
(butterfly ``__shfl_xor`` reduction strides ``{tpg/2, tpg/4, ..., 1}``).
(d) ``threads_per_group`` must fit inside a wavefront
(``<= 64`` on AMD CDNA); cross-warp shuffles do not exist on HIP.
(e) ``n % group_size == 0`` so ``num_groups = n / group_size`` is an
integer.
"""
if not isinstance(group_size, int):
raise TypeError(
f"per-group quant group_size must be int, got {type(group_size).__name__}"
)
if group_size <= 0:
raise ValueError(
f"per-group quant requires group_size > 0, got group_size={group_size}"
)
if element_size <= 0 or 16 % element_size != 0:
raise ValueError(
"per-group quant requires an element_size that divides 16 "
f"(bf16/fp16: 2), got element_size={element_size}"
)
pack_size = 16 // element_size
if group_size % pack_size != 0:
raise ValueError(
f"per-group quant requires group_size divisible by PACK_SIZE="
f"{pack_size} (16 // element_size), got group_size={group_size}"
)
threads_per_group = group_size // pack_size
if threads_per_group & (threads_per_group - 1) != 0:
raise ValueError(
"per-group quant requires group_size/PACK_SIZE to be a power of "
"two (butterfly __shfl_xor reduction), got "
f"group_size={group_size} PACK_SIZE={pack_size} "
f"threads_per_group={threads_per_group}"
)
if threads_per_group > _AITER_AR_WAVEFRONT_SIZE:
raise ValueError(
"per-group quant requires group_size/PACK_SIZE <= wavefront size "
f"({_AITER_AR_WAVEFRONT_SIZE}), got group_size={group_size} "
f"PACK_SIZE={pack_size} threads_per_group={threads_per_group}"
)
if n % group_size != 0:
raise ValueError(
f"per-group quant requires n divisible by group_size, "
f"got n={n} group_size={group_size}"
)
class IPCBuffer:
    """A single IPC-accessible device buffer.

    Pure data container: owns one pre-allocated allocation with a fixed
    address. IPC handle / broadcast / registration logic is not part of
    this class.

    When *uncached* is False (default), memory comes from ``torch.empty``
    as a ``uint8`` tensor on *device*. When True, memory comes from
    ``ops.allocate_meta_buffer`` and is released with
    ``ops.free_meta_buffer`` in ``__del__``; in that mode there is no
    backing tensor and only ``data_ptr`` is available.
    """

    def __init__(
        self,
        size: int,
        device: torch.device,
        uncached: bool = False,
    ):
        """Allocate *size* bytes.

        Args:
            size: buffer size in bytes.
            device: device for the tensor-backed (cached) allocation.
            uncached: select the ``ops.allocate_meta_buffer`` path.
        """
        self._size = size
        self._uncached = uncached
        # Initialise before allocating so that __del__ stays safe when the
        # allocation below raises and leaves the object half-constructed.
        self._buffer = None
        self._raw_ptr = 0
        if uncached:
            self._raw_ptr = ops.allocate_meta_buffer(size)
        else:
            self._buffer = torch.empty(size, dtype=torch.uint8, device=device)
            self._raw_ptr = self._buffer.data_ptr()

    @property
    def data_ptr(self) -> int:
        """Raw address of the allocation."""
        return self._raw_ptr

    @property
    def tensor(self) -> torch.Tensor:
        """Backing tensor; raises RuntimeError for uncached buffers."""
        if self._buffer is None:
            raise RuntimeError(
                "Uncached IPCBuffer has no backing tensor; use .data_ptr"
            )
        return self._buffer

    @property
    def max_size(self) -> int:
        """Size in bytes requested at construction."""
        return self._size

    @property
    def uncached(self) -> bool:
        """Whether the buffer was allocated through the uncached path."""
        return self._uncached

    def __del__(self):
        # getattr guards against objects whose __init__ never ran to the
        # point of setting these attributes.
        if getattr(self, "_uncached", False) and getattr(self, "_raw_ptr", 0):
            try:
                ops.free_meta_buffer(self._raw_ptr)
            except (AttributeError, TypeError):
                # Best-effort: the extension module may already be torn down.
                pass
            self._raw_ptr = 0
class IPCBufferPool:
"""Manages a collection of named IPCBuffers and provides IPC broadcast
infrastructure for cross-GPU communication.
Buffers are stored in an internal dict and accessed by string key.
Two sets of operations:
Eager mode (named internal buffers):
create(key, size) allocates a buffer and stores it under *key*.
get_ipc_meta(key) broadcasts IPC handles for that buffer.
Graph mode (arbitrary external tensors):
get_external_ipc_meta(tensor) broadcasts IPC handles for any tensor.
flush_graph_buffers(ar_ptr) batch-registers addresses that the C++
backend collected during CUDA graph capture.
"""
_pool_seq: int = 0
def __init__(self, device: torch.device, group: ProcessGroup):
self._device = device
self._group = group
self._rank = dist.get_rank(group=group)
self._world_size = dist.get_world_size(group=group)
self._buffers: Dict[str, IPCBuffer] = {}
self._store = dist.distributed_c10d._get_default_store()
self._assert_pure_tcp_store(self._store)
ranks_tag = "_".join(map(str, sorted(dist.get_process_group_ranks(group))))
self._store_key_prefix = f"aiter_ipc/p{IPCBufferPool._pool_seq}/g{ranks_tag}"
IPCBufferPool._pool_seq += 1
self._ipc_seq = 0
@staticmethod
def _assert_pure_tcp_store(store) -> None:
"""Verify the store is a pure-TCP KV store, free from any collective
communication backend (RCCL / gloo / MPI).
Emits a warning rather than aborting to allow non-TCP store setups."""
s = store
while isinstance(s, dist.PrefixStore):
s = s.underlying_store
if not isinstance(s, dist.TCPStore):
logger.warning(
"IPC metadata exchange prefers a pure-TCP KV store "
"(torch.distributed.TCPStore), got %s. "
"If IPC handle exchange fails, ensure MASTER_ADDR/MASTER_PORT "
"are set and the process group is initialised with a TCPStore.",
type(s).__name__,
)
# ---- Buffer lifecycle ----
def create(self, key: str, size: int, uncached: bool = False) -> IPCBuffer:
"""Allocate a new IPCBuffer and store it under *key*.
Args:
key: unique name for this buffer in the pool.
size: buffer size in bytes.
uncached: if True, allocate via hipMalloc (uncached);
if False (default), allocate via torch.empty (cached).
"""
if key in self._buffers:
raise KeyError(f"IPCBuffer '{key}' already exists in the pool")
buf = IPCBuffer(size, self._device, uncached=uncached)
self._buffers[key] = buf
return buf
def __getitem__(self, key: str) -> IPCBuffer:
return self._buffers[key]
def __contains__(self, key: str) -> bool:
return key in self._buffers
# ---- Eager mode: named buffer IPC meta ----
def get_ipc_meta(self, key: str) -> Tuple[List, List]:
"""Broadcast IPC handles for the named buffer across all ranks."""
buf = self._buffers[key]
return self._broadcast_ipc(buf.data_ptr)
# ---- Graph mode: external buffer IPC meta ----
def get_external_ipc_meta(self, tensor: torch.Tensor) -> Tuple[List, List]:
"""Broadcast IPC handles for an arbitrary external tensor."""
return self._broadcast_ipc(tensor.data_ptr())
def flush_graph_buffers(self, ar_ptr):
"""Batch-register buffer addresses collected during CUDA graph capture.
During graph capture the C++ backend records addresses of buffers that
are not yet IPC-registered. After capture ends this method exchanges
their IPC handles across all ranks and completes registration.
"""
count = ops.get_graph_buffer_count(ar_ptr)
if count == 0:
return
handle_sz = 64 # sizeof(hipIpcMemHandle_t)
handle = torch.empty(count * handle_sz, dtype=torch.uint8)
offset = torch.empty(count, dtype=torch.int64)
ops.get_graph_buffer_ipc_meta(ar_ptr, handle.data_ptr(), offset.data_ptr())
handles, offsets = self._gather_ipc_meta((handle, offset))
logger.info("Registering %d cuda graph addresses", count)
ops.register_graph_buffers(
ar_ptr,
[h.data_ptr() for h in handles],
[o.data_ptr() for o in offsets],
)
# ---- Private IPC primitives ----
def _broadcast_ipc(self, data_ptr: int) -> Tuple[List, List]:
"""Get IPC handle for *data_ptr* and broadcast across all ranks."""
handle = torch.empty(64, dtype=torch.uint8) # sizeof(hipIpcMemHandle_t)
ops.get_meta_buffer_ipc_handle(data_ptr, handle.data_ptr())
return self._gather_ipc_meta((handle, 0))
def _gather_ipc_meta(self, shard_data) -> Tuple[List, List]:
"""Exchange IPC metadata (handle + offset) across all ranks via TCP store.
Each rank writes its serialised *shard_data* under a unique key, then
reads every other rank's data. ``store.get()`` blocks until the key
is available, providing natural barrier semantics without involving any
collective communication backend.
"""
seq = self._ipc_seq
self._ipc_seq += 1
prefix = f"{self._store_key_prefix}/{seq}"
self._store.set(f"{prefix}/r{self._rank}", pickle.dumps(shard_data))
handles = []
offsets = []
for r in range(self._world_size):
raw = self._store.get(f"{prefix}/r{r}")
h, o = pickle.loads(raw)
handles.append(h)
offsets.append(o)
return handles, offsets
class CustomAllreduce:
_SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
......@@ -52,7 +322,8 @@ class CustomAllreduce:
self,
group: ProcessGroup,
device: Union[int, str, torch.device],
max_size=8192 * 1024 * 8,
max_size=1024 * 1024 * 1024, # 2GB bf16/half
enable_register_for_capturing: bool = True,
) -> None:
"""
Args:
......@@ -136,7 +407,7 @@ class CustomAllreduce:
# test P2P capability, this checks software/cudaruntime support
# this is expensive to compute at the first time
# then we cache the result
# On hygon GPU, p2p is always enabled between XGMI connected GPUs
# On AMD GPU, p2p is always enabled between XGMI connected GPUs
# if not current_platform.is_rocm() and not _can_p2p(rank, world_size):
# logger.warning(
# "Custom allreduce is disabled because your platform lacks "
......@@ -145,15 +416,7 @@ class CustomAllreduce:
# return
self.disabled = False
# buffers memory are owned by this Python class and passed to C++
# meta data composes of two parts: meta data for synchronization
# (256 bytes) and a temporary buffer for storing intermediate
# allreduce results.
# if current_platform.is_rocm():
self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
# This is a pre-registered IPC buffer. In eager mode, input tensors
# are first copied into this buffer before allreduce is performed
self.buffer = torch.empty(max_size, dtype=torch.uint8, device=self.device)
self.enable_register_for_capturing = enable_register_for_capturing
# This is a buffer for storing the tuples of pointers pointing to
# IPC buffers from all ranks. Each registered tuple has size of
# 8*world_size bytes where world_size is at most 8. Allocating 8MB
......@@ -165,24 +428,64 @@ class CustomAllreduce:
self.max_size = max_size
self.rank = rank
self.world_size = world_size
handle = ops.get_meta_buffer_ipc_handle(self.meta)
shard_data = (
handle, # ipc handle to base ptr
0, # offset of base ptr
)
handles, offsets = self._gather_ipc_meta(shard_data)
# Use a gloo-based barrier to synchronise init status across all ranks.
# If any rank fails (e.g. allocate_meta_buffer throws on platforms that
# do not support hipDeviceMallocUncached), the barrier ensures all other
# ranks learn about the failure instead of hanging forever in
# _gather_ipc_meta's store.get().
init_ok = torch.ones(1, dtype=torch.int32)
try:
# Create IPC buffer pool and allocate all named buffers.
# "meta" uses hipAlloc (uncached) for synchronization metadata +
# intermediate allreduce temp storage.
# "input" uses torchAlloc (cached) for D2D relay in eager mode.
self._pool = IPCBufferPool(self.device, self.group)
self._pool.create("meta", ops.meta_size() + max_size * 2, uncached=True)
self._pool.create("input", max_size)
except Exception as e:
init_ok[0] = 0
logger.warning(
"CustomAllreduce IPC buffer allocation failed (rank %d): %s. "
"Custom allreduce will be disabled.",
rank, e,
)
# All ranks must agree on whether init succeeded before proceeding to
# _gather_ipc_meta (which would hang if a peer rank is absent).
dist.all_reduce(init_ok, op=dist.ReduceOp.MIN, group=self.group)
if init_ok[0] == 0:
self.disabled = True
return
# Exchange meta buffer IPC handles to initialize C++ backend
handles, offsets = self._pool.get_ipc_meta("meta")
self.fully_connected = fully_connected
self._ptr = ops.init_custom_ar(
self.meta, self.rank_data, handles, offsets, rank, self.fully_connected
self._pool["meta"].data_ptr,
self.rank_data.data_ptr(),
self.rank_data.numel(),
[h.data_ptr() for h in handles],
offsets,
rank,
self.fully_connected,
)
# Register input IPC buffer with the C++ backend
handles, offsets = self._pool.get_ipc_meta("input")
ops.register_input_buffer(
self._ptr,
self._pool["input"].data_ptr,
[h.data_ptr() for h in handles],
offsets,
)
self.register_buffer(self.buffer)
@contextmanager
def capture(self):
"""
The main responsibility of this context manager is the
`register_graph_buffers` call at the end of the context.
flush_graph_buffers call at the end of the context.
It records all the buffer addresses used in the CUDA graph.
"""
try:
......@@ -191,61 +494,27 @@ class CustomAllreduce:
finally:
self._IS_CAPTURING = False
if not self.disabled:
self.register_graph_buffers()
def _get_ipc_meta(self, inp: torch.Tensor):
# if current_platform.is_rocm():
if 1:
# _share_cuda_() doesn't accept meta buffer not allocated from
# PyTorch cache allocator, use direct HIP call to get IPC handle
handle = ops.get_meta_buffer_ipc_handle(inp)
shard_data = (
handle, # ipc handle to base ptr
0, # offset of base ptr
)
else:
data = inp.untyped_storage()._share_cuda_()
shard_data = (
data[1], # ipc handle to base ptr
data[3], # offset of base ptr
)
return self._gather_ipc_meta(shard_data)
def _gather_ipc_meta(self, shard_data):
# Note: don't use `[[None]] * self.world_size` here
# because it will create a list of the same reference
all_data: List[Optional[Any]] = [[None] for i in range(self.world_size)]
all_data[self.rank][0] = shard_data
ranks = dist.get_process_group_ranks(group=self.group)
ranks.sort()
for i, rank in enumerate(ranks):
dist.broadcast_object_list(
all_data[i], src=rank, group=self.group, device="cpu"
)
# we cannot directly use `dist.all_gather_object` here
# because it is incompatible with `gloo` backend under inference mode.
# see https://github.com/pytorch/pytorch/issues/126032 for details.
self._pool.flush_graph_buffers(self._ptr)
handles = []
offsets = []
for i in range(len(all_data)):
handles.append(all_data[i][0][0]) # type: ignore
offsets.append(all_data[i][0][1]) # type: ignore
return handles, offsets
def register_input_buffer(self, inp: torch.Tensor):
    """Register an external tensor as an IPC input buffer.

    Exchanges the tensor's IPC handles through the buffer pool, then passes
    the local address plus every rank's handle pointer and offset to the
    backend.
    """
    peer_handles, peer_offsets = self._pool.get_external_ipc_meta(inp)
    handle_ptrs = [peer.data_ptr() for peer in peer_handles]
    ops.register_input_buffer(self._ptr, inp.data_ptr(), handle_ptrs, peer_offsets)
def register_buffer(self, inp: torch.Tensor):
handles, offsets = self._get_ipc_meta(inp)
ops.register_buffer(self._ptr, inp, handles, offsets)
def register_output_buffer(self, out: torch.Tensor):
    """Register an external tensor as an IPC output buffer.

    Exchanges the tensor's IPC handles through the buffer pool, then passes
    the local address plus every rank's handle pointer and offset to the
    backend.
    """
    peer_handles, peer_offsets = self._pool.get_external_ipc_meta(out)
    handle_ptrs = [peer.data_ptr() for peer in peer_handles]
    ops.register_output_buffer(self._ptr, out.data_ptr(), handle_ptrs, peer_offsets)
def register_graph_buffers(self):
handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
handles, offsets = self._gather_ipc_meta((handle, offset))
logger.info("Registering %d cuda graph addresses", len(offset))
ops.register_graph_buffers(self._ptr, handles, offsets)
"""Batch-register graph-captured buffer addresses."""
self._pool.flush_graph_buffers(self._ptr)
def should_custom_ar(self, inp: torch.Tensor):
def should_custom_ar(self, inp: torch.Tensor, prefill_support: bool = False):
if self.disabled:
return False
inp_size = inp.numel() * inp.element_size()
......@@ -256,8 +525,28 @@ class CustomAllreduce:
return False
# for 4 or more non NVLink-capable GPUs, custom allreduce provides
# little performance improvement over NCCL.
# In allreduce 2stage writemode, use 2x tmp buffer
if self.world_size == 2 or self.fully_connected:
return inp_size <= self.max_size
# decode
if not prefill_support:
return inp_size <= 8192 * 8192
# prefill
else:
return inp_size <= (self.max_size / 2)
return False
def should_custom_ag(self, inp: torch.Tensor):
    """Decide whether the custom all-gather path may handle *inp*.

    Returns False when the communicator is disabled, when the input's byte
    size is not a multiple of 16, when the input is not weak-contiguous, or
    when the topology is neither 2 ranks nor fully connected. Otherwise the
    input qualifies if its byte size is at most
    ``max_size / (world_size * 2)``.
    """
    if self.disabled:
        return False
    nbytes = inp.numel() * inp.element_size()
    misaligned = nbytes % 16 != 0
    if misaligned or not is_weak_contiguous(inp):
        return False
    if not (self.world_size == 2 or self.fully_connected):
        return False
    # The gathered output is world_size times the input; the limit below
    # additionally halves the per-rank share of max_size.
    per_rank_budget = self.max_size / (self.world_size * 2)
    return nbytes <= per_rank_budget
def all_reduce(
......@@ -265,8 +554,9 @@ class CustomAllreduce:
inp: torch.Tensor,
*,
out: Optional[torch.Tensor] = None,
use_new: bool = True,
open_fp8_quant: bool = False,
registered: bool = False,
registered_input: bool = False,
):
"""Performs an out-of-place all reduce.
......@@ -276,17 +566,22 @@ class CustomAllreduce:
"""
if out is None:
out = torch.empty_like(inp)
assert is_weak_contiguous(out), "output tensor is not weak-contiguous"
reg_inp = 0 if registered_input else self._pool["input"].data_ptr
reg_inp_bytes = 0 if registered_input else self._pool["input"].max_size
ops.all_reduce(
self._ptr,
inp,
out,
use_new,
open_fp8_quant,
None if registered else self.buffer,
reg_inp,
reg_inp_bytes,
)
return out
def custom_all_reduce(
self, input: torch.Tensor, open_fp8_quant: bool = False
self, input: torch.Tensor, use_new: bool = True, open_fp8_quant: bool = False
) -> Optional[torch.Tensor]:
# when custom allreduce is disabled, this will be None
if self.disabled or not self.should_custom_ar(input):
......@@ -294,7 +589,10 @@ class CustomAllreduce:
if self._IS_CAPTURING:
if torch.cuda.is_current_stream_capturing():
return self.all_reduce(
input, open_fp8_quant=open_fp8_quant, registered=True
input,
use_new=use_new,
open_fp8_quant=open_fp8_quant,
registered_input=self.enable_register_for_capturing,
)
else:
# if warm up, mimic the allocation pattern
......@@ -306,34 +604,100 @@ class CustomAllreduce:
# be small(<=1% of overall latency) compared to the performance
# gains of using custom kernels
return self.all_reduce(
input, open_fp8_quant=open_fp8_quant, registered=False
input,
use_new=use_new,
open_fp8_quant=open_fp8_quant,
registered_input=False,
)
def all_gather_reg(self, inp: torch.Tensor, out: torch.Tensor = None):
def reduce_scatter(
    self,
    inp: torch.Tensor,
    out: torch.Tensor,
    *,
    registered: bool = False,
):
    """Launch the backend reduce-scatter of *inp* into *out*.

    When *registered* is False, the pool's "input" buffer address and size
    are passed along to the backend; when True, both are passed as 0.
    Returns nothing: the result is written into *out*.
    """
    assert is_weak_contiguous(out), "output tensor is not weak-contiguous"
    if registered:
        staging_ptr, staging_bytes = 0, 0
    else:
        staging = self._pool["input"]
        staging_ptr, staging_bytes = staging.data_ptr, staging.max_size
    ops.reduce_scatter(self._ptr, inp, out, staging_ptr, staging_bytes)
def custom_reduce_scatter(
    self, input: torch.Tensor, output: torch.Tensor
) -> Optional[torch.Tensor]:
    """Run the custom reduce-scatter if this communicator can handle *input*.

    Returns:
        None when custom all-reduce is disabled or ``should_custom_ar``
        rejects *input* (the caller is expected to use another path);
        otherwise *output*, so a handled call can be told apart from a
        declined one.
    """
    # when custom allreduce is disabled, this will be None
    if self.disabled or not self.should_custom_ar(input):
        return None
    if self._IS_CAPTURING:
        if torch.cuda.is_current_stream_capturing():
            self.reduce_scatter(input, output, registered=True)
        # else: inside the capture context but the stream is not capturing
        # (warm-up). No kernel is launched; *output* is returned untouched
        # so the call still counts as handled.
        return output
    self.reduce_scatter(input, output, registered=False)
    return output
def _allgather_out_shape(self, inp: torch.Tensor, dim: int):
ndim = inp.dim()
if dim == 0:
return (inp.shape[0] * self.world_size,) + inp.shape[1:]
if dim == -1 or dim == ndim - 1:
return inp.shape[:-1] + (inp.shape[-1] * self.world_size,)
print(
f"[aiter] allgather does not support dim={dim}, falling back to 1-D output"
)
return (inp.numel() * self.world_size,)
def all_gather_reg(self, inp: torch.Tensor, out: torch.Tensor = None, dim: int = 0):
if out is None:
out = torch.empty(
inp.numel() * self.world_size, dtype=inp.dtype, device=inp.device
self._allgather_out_shape(inp, dim),
dtype=inp.dtype,
device=inp.device,
)
ops.all_gather_reg(self._ptr, inp, out)
assert is_weak_contiguous(out), "output tensor is not weak-contiguous"
ops.all_gather_reg(
self._ptr,
inp,
out,
dim,
)
return out
def all_gather_unreg(self, inp: torch.Tensor, out: torch.Tensor = None):
def all_gather_unreg(
self, inp: torch.Tensor, out: torch.Tensor = None, dim: int = 0
):
if out is None:
out = torch.empty(
inp.numel() * self.world_size, dtype=inp.dtype, device=inp.device
self._allgather_out_shape(inp, dim),
dtype=inp.dtype,
device=inp.device,
)
ops.all_gather_unreg(self._ptr, inp, self.buffer, out)
assert is_weak_contiguous(out), "output tensor is not weak-contiguous"
ops.all_gather_unreg(
self._ptr,
inp,
self._pool["input"].data_ptr,
out,
self._pool["input"].max_size,
dim,
)
return out
def custom_all_gather(self, inp: torch.Tensor) -> Optional[torch.Tensor]:
def custom_all_gather(
self, inp: torch.Tensor, dim: int = 0
) -> Optional[torch.Tensor]:
if self._IS_CAPTURING:
if torch.cuda.is_current_stream_capturing():
return self.all_gather_reg(inp)
return self.all_gather_reg(inp, dim=dim)
else:
print("allgather capture hipgraph error")
return torch.zeros_like(inp)
else:
return self.all_gather_unreg(inp)
return self.all_gather_unreg(inp, dim=dim)
def fused_ar_rms(
self,
......@@ -342,51 +706,314 @@ class CustomAllreduce:
*,
res_out: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
scale_out: Optional[torch.Tensor] = None,
w: torch.Tensor,
eps: float,
registered: bool = False,
use_1stage: bool = False,
post_per_token_quant: bool = False,
):
if out is None:
out = torch.empty_like(inp)
if res_out is None:
res_out = torch.empty_like(inp)
ops.fused_allreduce_rmsnorm(
reg = 0 if registered else self._pool["input"].data_ptr
reg_bytes = 0 if registered else self._pool["input"].max_size
if not post_per_token_quant:
if out is None:
out = torch.empty_like(inp)
assert is_weak_contiguous(out), "output tensor is not weak-contiguous"
ops.fused_allreduce_rmsnorm(
self._ptr,
inp,
res_inp,
res_out,
out,
w,
eps,
reg,
reg_bytes,
use_1stage,
)
return out, res_out
else:
if out is None:
out = torch.empty(inp.shape, dtype=fp8, device=inp.device)
assert is_weak_contiguous(out), "output tensor is not weak-contiguous"
if scale_out is None:
scale_out = torch.empty(
inp.shape[:-1] + (1,), dtype=torch.float32, device=inp.device
)
ops.fused_allreduce_rmsnorm_quant(
self._ptr,
inp,
res_inp,
res_out,
out,
scale_out,
w,
eps,
reg,
reg_bytes,
use_1stage,
)
return out, res_out, scale_out
def custom_fused_ar_rms(
    self,
    input: torch.Tensor,
    residual_inp: torch.Tensor,
    weight: torch.Tensor,
    eps: float,
    use_1stage: bool = False,
) -> Optional[torch.Tensor]:
    """Dispatch the fused all-reduce + RMSNorm kernel if eligible.

    Returns None when custom all-reduce is disabled or ``should_custom_ar``
    rejects *input*. Inside a capture context with a non-capturing stream
    (warm-up) a pair of zero tensors shaped like *input* is returned
    without launching the kernel. Otherwise the result of ``fused_ar_rms``
    is returned, with ``registered=True`` only while the stream is
    actually capturing.
    """
    eligible = not self.disabled and self.should_custom_ar(input)
    if not eligible:
        # Callers treat None as "custom path unavailable".
        return None

    if not self._IS_CAPTURING:
        registered = False
    elif torch.cuda.is_current_stream_capturing():
        registered = True
    else:
        return torch.zeros_like(input), torch.zeros_like(input)

    return self.fused_ar_rms(
        input,
        residual_inp,
        w=weight,
        eps=eps,
        registered=registered,
        use_1stage=use_1stage,
    )
def custom_fused_ar_rms_quant(
    self,
    input: torch.Tensor,
    residual_inp: torch.Tensor,
    weight: torch.Tensor,
    eps: float,
    use_1stage: bool = False,
):
    """Dispatch the fused all-reduce + RMSNorm + per-token quant kernel.

    Returns None when custom all-reduce is disabled or ``should_custom_ar``
    rejects *input*. Inside a capture context with a non-capturing stream
    (warm-up) zero-filled placeholders ``(out, res_out, scale_out)`` are
    returned without launching the kernel. Otherwise ``fused_ar_rms`` is
    called with ``post_per_token_quant=True`` and ``registered=True`` only
    while the stream is actually capturing.
    """
    eligible = not self.disabled and self.should_custom_ar(input)
    if not eligible:
        # Callers treat None as "custom path unavailable".
        return None

    if not self._IS_CAPTURING:
        registered = False
    elif torch.cuda.is_current_stream_capturing():
        registered = True
    else:
        # Warm-up: hand back outputs with the shapes/dtypes the real
        # kernel would produce (one float32 scale per row).
        dummy_out = torch.zeros(input.shape, dtype=fp8, device=input.device)
        dummy_scale_out = torch.zeros(
            input.shape[:-1] + (1,), dtype=torch.float32, device=input.device
        )
        return dummy_out, torch.zeros_like(input), dummy_scale_out

    return self.fused_ar_rms(
        input,
        residual_inp,
        w=weight,
        eps=eps,
        registered=registered,
        use_1stage=use_1stage,
        post_per_token_quant=True,
    )
def fused_ar_rms_per_group_quant(
self,
inp: torch.Tensor,
res_inp: torch.Tensor,
*,
w: torch.Tensor,
eps: float,
group_size: int = 128,
registered: bool = False,
use_1stage: bool = False,
emit_bf16: bool = False,
):
K = inp.shape[-1]
# Fail fast on bad ``group_size`` at the Python boundary. Mirrors
# the C++ host dispatcher checks; catching it here surfaces a
# synchronous ``ValueError`` instead of a post-launch
# ``RuntimeError`` that would only fire at CUDA-graph replay and
# would be much harder to attribute to the offending call site.
_validate_per_group_size(group_size, inp.element_size(), K)
res_out = torch.empty_like(inp)
num_groups = K // group_size
out = torch.empty(inp.shape, dtype=fp8, device=inp.device)
scale_out = torch.empty(
inp.shape[:-1] + (num_groups,), dtype=torch.float32, device=inp.device
)
# Optional bf16/fp16 mirror of the pre-quantization normed output.
# Requested by GDN-style layers that also need an unquantized view
# (e.g. Qwen3.5 in_proj_ba). Zero-overhead when not requested
# because the kernel branches on the pointer being non-null.
bf16_out = None
bf16_ptr = 0
if emit_bf16:
bf16_out = torch.empty_like(inp)
bf16_ptr = int(bf16_out.data_ptr())
reg = 0 if registered else self._pool["input"].data_ptr
reg_bytes = 0 if registered else self._pool["input"].max_size
ops.fused_allreduce_rmsnorm_quant_per_group(
self._ptr,
inp,
res_inp,
res_out,
out,
scale_out,
w,
eps,
None if registered else self.buffer,
group_size,
reg,
reg_bytes,
use_1stage,
bf16_ptr,
)
return res_out, out
if emit_bf16:
return out, res_out, scale_out, bf16_out
return out, res_out, scale_out
def fused_qknorm_ar(
    self,
    qkv_in: torch.Tensor,
    q_w: torch.Tensor,
    k_w: torch.Tensor,
    eps: float,
    registered: bool = False,
):
    """Allocate Q/K/V outputs and launch the backend fused QK-norm all-reduce.

    Output widths come from the last dimension of ``q_w`` and ``k_w``; the
    V width is whatever remains of ``qkv_in.shape[1]``. The row count is
    ``qkv_in.shape[0]``. When *registered* is False the pool's "input"
    buffer address and size are passed to the backend; when True both are
    passed as 0.

    Returns:
        ``(q_out, k_out, v_out)``.
    """
    q_width = q_w.shape[-1]
    k_width = k_w.shape[-1]
    rows = qkv_in.shape[0]
    v_width = qkv_in.shape[1] - (q_width + k_width)

    alloc = {"dtype": qkv_in.dtype, "device": qkv_in.device}
    q_out = torch.empty((rows, q_width), **alloc)
    k_out = torch.empty((rows, k_width), **alloc)
    v_out = torch.empty((rows, v_width), **alloc)

    if registered:
        staging_ptr, staging_bytes = 0, 0
    else:
        staging = self._pool["input"]
        staging_ptr, staging_bytes = staging.data_ptr, staging.max_size

    ops.fused_qknorm_allreduce(
        self._ptr,
        qkv_in,
        q_w,
        k_w,
        q_out,
        k_out,
        v_out,
        eps,
        staging_ptr,
        staging_bytes,
    )
    return q_out, k_out, v_out
def custom_fused_qknorm_ar(
    self,
    qkv_in: torch.Tensor,
    q_w: torch.Tensor,
    k_w: torch.Tensor,
    eps: float,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Dispatch the fused QK-norm + all-reduce kernel.

    Returns ``(q_out, k_out, v_out)``. The real kernel result is returned
    in eager mode and while the stream is capturing (``registered=True``).

    NOTE: when custom all-reduce is disabled, and during warm-up inside a
    capture context (stream not capturing), no kernel runs and the three
    returned tensors are *uninitialised* placeholders of the right
    shape/dtype. Unlike the other ``custom_*`` entry points this one does
    not return None when disabled, so callers cannot detect that case from
    the return value.
    """

    def _placeholder_outputs():
        # Same shapes the real kernel produces: V takes whatever width is
        # left of qkv_in after the Q and K slices.
        rows = qkv_in.shape[0]
        q_width = q_w.shape[-1]
        k_width = k_w.shape[-1]
        v_width = qkv_in.shape[1] - q_width - k_width
        return tuple(
            torch.empty((rows, width), dtype=qkv_in.dtype, device=qkv_in.device)
            for width in (q_width, k_width, v_width)
        )

    if self.disabled:
        return _placeholder_outputs()
    if self._IS_CAPTURING:
        if torch.cuda.is_current_stream_capturing():
            return self.fused_qknorm_ar(
                qkv_in,
                q_w,
                k_w,
                eps,
                registered=True,
            )
        return _placeholder_outputs()
    return self.fused_qknorm_ar(
        qkv_in,
        q_w,
        k_w,
        eps,
        registered=False,
    )
def custom_fused_ar_rms(
def custom_fused_ar_rms_per_group_quant(
self,
input: torch.Tensor,
residual_inp: torch.Tensor,
weight: torch.Tensor,
eps: float,
) -> Optional[torch.Tensor]:
# when custom allreduce is disabled, this will be None
group_size: int = 128,
use_1stage: bool = False,
emit_bf16: bool = False,
):
if self.disabled or not self.should_custom_ar(input):
return None
if self._IS_CAPTURING:
if torch.cuda.is_current_stream_capturing():
return self.fused_ar_rms(
input, residual_inp, w=weight, eps=eps, registered=True
return self.fused_ar_rms_per_group_quant(
input,
residual_inp,
w=weight,
eps=eps,
group_size=group_size,
registered=True,
use_1stage=use_1stage,
emit_bf16=emit_bf16,
)
else:
return torch.zeros_like(input), torch.zeros_like(input)
K = input.shape[-1]
num_groups = K // group_size
dummy_out = torch.zeros(input.shape, dtype=fp8, device=input.device)
dummy_scale = torch.zeros(
input.shape[:-1] + (num_groups,),
dtype=torch.float32,
device=input.device,
)
if emit_bf16:
return (
dummy_out,
torch.zeros_like(input),
dummy_scale,
torch.zeros_like(input),
)
return dummy_out, torch.zeros_like(input), dummy_scale
else:
return self.fused_ar_rms(
input, residual_inp, w=weight, eps=eps, registered=False
return self.fused_ar_rms_per_group_quant(
input,
residual_inp,
w=weight,
eps=eps,
group_size=group_size,
registered=False,
use_1stage=use_1stage,
emit_bf16=emit_bf16,
)
def close(self):
if not self.disabled and self._ptr:
ops.dispose(self._ptr)
if not self.disabled and getattr(self, "_ptr", 0):
try:
ops.dispose(self._ptr)
except (AttributeError, TypeError):
pass
self._ptr = 0
def __del__(self):
......
......@@ -319,7 +319,11 @@ class GroupCoordinator:
yield graph_capture_context
def all_reduce(
self, input_: torch.Tensor, ca_fp8_quant: bool = False
self,
input_: torch.Tensor,
use_new: bool = True,
open_fp8_quant: bool = False,
prefill_support: bool = False,
) -> torch.Tensor:
"""
User-facing all-reduce function before we actually call the
......@@ -340,7 +344,7 @@ class GroupCoordinator:
return input_
return all_reduce_(
input_, group_name=self.unique_name, ca_fp8_quant=ca_fp8_quant
input_, group_name=self.unique_name, ca_fp8_quant=open_fp8_quant
)
def _all_reduce_out_place(
......@@ -348,7 +352,7 @@ class GroupCoordinator:
) -> torch.Tensor:
if self.device_communicator is None:
raise ValueError("No device communicator found")
return self.device_communicator.all_reduce(input_, ca_fp8_quant)
return self.device_communicator.all_reduce(input_, ca_fp8_quant=ca_fp8_quant)
def fused_allreduce_rmsnorm(
self,
......@@ -356,11 +360,54 @@ class GroupCoordinator:
residual_inp_: torch.Tensor,
weight_: torch.Tensor,
eps: float,
prefill_support: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
return fused_allreduce_rmsnorm_(
input_, residual_inp_, weight_, eps, group_name=self.unique_name
)
def fused_allreduce_rmsnorm_quant(
    self,
    input_: torch.Tensor,
    residual_inp_: torch.Tensor,
    weight_: torch.Tensor,
    eps: float,
    prefill_support: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Forward to ``device_communicator.fused_allreduce_rmsnorm_quant``.

    All five arguments are passed positionally and unchanged; the result of
    the communicator call is returned as-is.

    Raises:
        ValueError: If this group has no device communicator.
    """
    communicator = self.device_communicator
    if communicator is None:
        raise ValueError("No device communicator found")
    forwarded = (input_, residual_inp_, weight_, eps, prefill_support)
    return communicator.fused_allreduce_rmsnorm_quant(*forwarded)
def fused_allreduce_rmsnorm_quant_per_group(
    self,
    input_: torch.Tensor,
    residual_inp_: torch.Tensor,
    weight_: torch.Tensor,
    eps: float,
    group_size: int = 128,
    prefill_support: bool = False,
    emit_bf16: bool = False,
):
    """Forward to ``device_communicator.fused_allreduce_rmsnorm_quant_per_group``.

    ``input_`` through ``prefill_support`` are passed positionally;
    ``emit_bf16`` is passed by keyword. The communicator's result is returned
    unchanged.

    Raises:
        ValueError: If this group has no device communicator.
    """
    communicator = self.device_communicator
    if communicator is None:
        raise ValueError("No device communicator found")
    positional = (
        input_,
        residual_inp_,
        weight_,
        eps,
        group_size,
        prefill_support,
    )
    return communicator.fused_allreduce_rmsnorm_quant_per_group(
        *positional, emit_bf16=emit_bf16
    )
def fused_qknorm_allreduce(
    self,
    qkv_in: torch.Tensor,
    q_w: torch.Tensor,
    k_w: torch.Tensor,
    eps: float,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Forward to ``device_communicator.fused_qknorm_allreduce``.

    The four arguments are passed positionally and the communicator's result
    is returned unchanged.

    Raises:
        ValueError: If this group has no device communicator.
    """
    communicator = self.device_communicator
    if communicator is None:
        raise ValueError("No device communicator found")
    result = communicator.fused_qknorm_allreduce(qkv_in, q_w, k_w, eps)
    return result
def _fused_allreduce_rmsnorm_out_place(
self,
input_: torch.Tensor,
......@@ -375,12 +422,31 @@ class GroupCoordinator:
)
def _all_gather_out_place(self, input_: torch.Tensor) -> torch.Tensor:
ca_comm = self.device_communicator.ca_comm
assert ca_comm is not None
assert not ca_comm.disabled
out = ca_comm.custom_all_gather(input_)
assert out is not None
return out
ca_comm = (
self.device_communicator.ca_comm
if self.device_communicator is not None
else None
)
if ca_comm is not None and not ca_comm.disabled:
out = ca_comm.custom_all_gather(input_)
assert out is not None
return out
# Fallback: ca_comm unavailable (e.g. non-tp custom group).
# Try pynccl first (graph-safe), then torch.distributed.
world_size = self.world_size
out_shape = (input_.shape[0] * world_size,) + input_.shape[1:]
output_tensor = torch.empty(
out_shape, dtype=input_.dtype, device=input_.device
)
if self.device_communicator is not None:
pynccl_comm = self.device_communicator.pynccl_comm
if pynccl_comm is not None and not pynccl_comm.disabled:
pynccl_comm.all_gather(output_tensor, input_)
return output_tensor
torch.distributed.all_gather_into_tensor(
output_tensor, input_, group=self.device_group
)
return output_tensor
def custom_all_gather(self, input_: torch.Tensor) -> torch.Tensor:
    """Gather ``input_`` through ``outplace_all_gather`` for this group.

    The group is identified to the op by ``self.unique_name``.
    """
    gathered = outplace_all_gather(input_, group_name=self.unique_name)
    return gathered
......@@ -390,6 +456,33 @@ class GroupCoordinator:
raise ValueError("No device communicator found")
return self.device_communicator.reduce_scatter(input_, dim)
def reduce_scatter_tensor(
    self,
    input_: torch.Tensor,
    use_custom: bool = True,
    dim: int = 0,
):
    """Reduce-scatter ``input_`` across the group.

    With ``use_custom`` set and a device communicator present, the call is
    delegated to ``device_communicator.reduce_scatter(input_, dim)``.
    Otherwise an output buffer is allocated and filled by
    ``torch.distributed.reduce_scatter_tensor`` on ``self.device_group``.

    The fallback output shape is ``(shape[0] // world_size, *shape[1:])``
    when the leading dimension divides evenly, else a flat
    ``(numel // world_size,)`` vector.

    NOTE(review): ``dim`` is only honoured on the custom path; the fallback
    never passes it on — confirm callers use ``dim=0`` there.

    Raises:
        AssertionError: If ``world_size`` is not greater than 1, or
            ``input_.numel()`` is not a multiple of ``world_size``.
    """
    ranks = self.world_size
    assert ranks > 1, "error! world_size = 1"
    assert (
        input_.numel() % ranks == 0
    ), "input shape error, input.numel() % world_size should equals to 0"

    leading = input_.shape[0]
    if leading % ranks != 0:
        out_shape = (input_.numel() // ranks,)
    else:
        out_shape = (leading // ranks,) + input_.shape[1:]

    take_custom_path = use_custom and self.device_communicator is not None
    if take_custom_path:
        return self.device_communicator.reduce_scatter(input_, dim)

    output_ = torch.empty(out_shape, dtype=input_.dtype, device=input_.device)
    torch.distributed.reduce_scatter_tensor(
        output_, input_, group=self.device_group
    )
    return output_
def all_gather(
self, input_: torch.Tensor, use_custom: bool = False, dim: int = -1
......@@ -897,6 +990,71 @@ def get_ep_group() -> GroupCoordinator:
return _EP
# Registry of user-defined communication groups, keyed by the group name from
# ``custom_group_config``. Filled by ``initialize_model_parallel`` and emptied
# again by ``destroy_model_parallel``.
_CUSTOM: Dict[str, GroupCoordinator] = {}
def has_custom_group() -> bool:
    """Tell whether at least one custom group is currently registered."""
    return len(_CUSTOM) > 0
def get_custom_group(
    name: Optional[str] = None,
) -> "Union[GroupCoordinator, Dict[str, GroupCoordinator]]":
    """Look up custom group coordinator(s).

    - ``name`` given: return the coordinator registered under that name.
    - ``name`` omitted and exactly one group registered: return that
      coordinator directly.
    - ``name`` omitted and several groups registered: return a copy of the
      whole name -> coordinator mapping so the caller can pick one.

    Raises:
        AssertionError: If no custom group is registered, or ``name`` is not
            a registered group.
    """
    assert _CUSTOM, "custom allreduce group is not initialized"

    if name is None:
        if len(_CUSTOM) != 1:
            return dict(_CUSTOM)
        (sole_group,) = _CUSTOM.values()
        return sole_group

    assert name in _CUSTOM, (
        f"custom group '{name}' not found, "
        f"available: {list(_CUSTOM.keys())}"
    )
    return _CUSTOM[name]
class CustomGroupConfig:
    """Configuration builder for custom communication groups.

    Each group is defined by a rank list that can be:
    - 1D List[int]: all ranks form a single communication group,
      e.g. [0,1,2,3,4,5,6,7] → one TP8 group
    - 2D List[List[int]]: multiple independent subgroups,
      e.g. [[0,1,2,3],[4,5,6,7]] → two independent TP4 groups

    Rank lists are snapshotted on the way in (``add_group``) and on the way
    out (``data``), so mutating a list after registering it, or mutating the
    mapping returned by ``data``, never changes the stored configuration.

    Usage:
        config = CustomGroupConfig()
        config.add_group("tp_group", [[0,1,2,3],[4,5,6,7]])
        ensure_model_parallel_initialized(..., custom_group_config=config.data())

    Or pass a raw dict directly:
        ensure_model_parallel_initialized(..., custom_group_config={
            "tp_group": [[0,1,2,3],[4,5,6,7]],
        })
    """

    def __init__(self):
        # Group name -> rank list (1D or 2D), owned exclusively by this object.
        self._groups: Dict[str, List] = {}

    @staticmethod
    def _snapshot(ranks: List) -> List:
        """Return an independent copy of a (possibly nested) rank list."""
        # Function-scope import keeps the module's import block untouched.
        from copy import deepcopy

        return deepcopy(ranks)

    def add_group(self, name: str, ranks: List) -> "CustomGroupConfig":
        """Register ``ranks`` under ``name`` and return ``self`` for chaining.

        Raises:
            AssertionError: If ``name`` is already registered or ``ranks`` is
                empty.
        """
        assert name not in self._groups, f"custom group '{name}' already exists"
        assert ranks, f"custom group '{name}': ranks list must not be empty"
        # Copy so later edits to the caller's list cannot alter the config.
        self._groups[name] = self._snapshot(ranks)
        return self

    def data(self) -> Dict[str, List]:
        """Return the registered groups as a fresh ``name -> ranks`` dict.

        Raises:
            AssertionError: If no group has been added yet.
        """
        assert self._groups, "no custom groups have been added"
        return {
            name: self._snapshot(ranks) for name, ranks in self._groups.items()
        }
# kept for backward compatibility
get_pipeline_model_parallel_group = get_pp_group
......@@ -996,6 +1154,7 @@ def initialize_model_parallel(
# decode_context_model_parallel_size: Optional[int] = 1,
backend: Optional[str] = None,
data_parallel_size: int = 1,
custom_group_config: Optional[Dict[str, List]] = None,
) -> None:
"""
Initialize model parallel groups.
......@@ -1006,6 +1165,12 @@ def initialize_model_parallel(
pipeline_model_parallel_size: number of GPUs used for pipeline model
parallelism.
backend: name of torch distributed communication backend.
custom_group_config: optional dict mapping group names to rank lists.
Each value can be:
- 1D List[int]: all ranks form a single group,
e.g. [0,1,2,3,4,5,6,7]
- 2D List[List[int]]: multiple independent subgroups,
e.g. [[0,1,2,3],[4,5,6,7]]
Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
......@@ -1109,6 +1274,55 @@ def initialize_model_parallel(
group_ranks, get_world_group().local_rank, backend, group_name="ep"
)
# Build the custom allreduce group(s) (optional).
global _CUSTOM
assert not _CUSTOM, "custom allreduce group is already initialized"
if custom_group_config is not None:
for gname, ranks in custom_group_config.items():
assert (
isinstance(ranks, list) and len(ranks) > 0
), f"custom group '{gname}': value must be a non-empty list"
if all(isinstance(r, int) for r in ranks):
group_ranks = [ranks]
elif all(isinstance(g, list) for g in ranks):
group_ranks = ranks
subgroup_size = len(group_ranks[0])
for g in group_ranks:
assert len(g) == subgroup_size, (
f"custom group '{gname}': all subgroups must "
f"have the same size, expected {subgroup_size} "
f"but got {len(g)}"
)
assert all(isinstance(r, int) for r in g), (
f"custom group '{gname}': subgroup elements "
f"must be integers"
)
else:
raise AssertionError(
f"custom group '{gname}': value must be List[int] "
f"(1D) or List[List[int]] (2D)"
)
all_ranks_flat = [r for g in group_ranks for r in g]
assert len(all_ranks_flat) == world_size, (
f"custom group '{gname}': total ranks "
f"({len(all_ranks_flat)}) must equal world_size ({world_size})"
)
assert len(set(all_ranks_flat)) == world_size, (
f"custom group '{gname}': contains duplicate ranks"
)
assert set(all_ranks_flat) == set(range(world_size)), (
f"custom group '{gname}': must cover all ranks 0..{world_size - 1}"
)
_CUSTOM[gname] = init_model_parallel_group(
group_ranks,
get_world_group().local_rank,
backend,
group_name=f"custom_{gname}",
)
logger.info(
"rank %s in world size %s is assigned as "
"DP rank %s, PP rank %s, TP rank %s, EP rank %s",
......@@ -1126,6 +1340,7 @@ def ensure_model_parallel_initialized(
pipeline_model_parallel_size: int,
backend: Optional[str] = None,
data_parallel_size: int = 1,
custom_group_config: Optional[Dict[str, List]] = None,
) -> None:
"""Helper to initialize model parallel groups if they are not initialized,
or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
......@@ -1138,6 +1353,7 @@ def ensure_model_parallel_initialized(
pipeline_model_parallel_size,
backend,
data_parallel_size,
custom_group_config,
)
return
......@@ -1209,6 +1425,11 @@ def destroy_model_parallel():
_PP.destroy()
_PP = None
global _CUSTOM
for g in _CUSTOM.values():
g.destroy()
_CUSTOM.clear()
def destroy_distributed_environment():
global _WORLD
......
......@@ -733,11 +733,11 @@ def torch_moe_blockscale(
# [expert, model_dim/blk_m, inter_dim/blk_k]
fc2_scale=None,
expert_mask=None,
computeType=torch.float32,
):
computeType = dtypes.fp32
hidden_states = hidden_states.to(computeType)
w1 = w1.to(computeType)
w2 = w2.to(computeType)
hidden_states = hidden_states.float().to(computeType)
w1 = w1.float().to(computeType)
w2 = w2.float().to(computeType)
token_num, topk = topk_ids.shape
expert, model_dim, inter_dim = w2.shape
B, D = hidden_states.shape
......@@ -767,9 +767,8 @@ def torch_moe_blockscale(
nblk_n = inter_dim // blk_n
nblk_k = model_dim // blk_k
if fc1_scale is not None:
# gose to quant D_w8a8/w8a8
# blk_n, blk_k = scale_blks
# expert, nblk_n, nblk_k = fc1_scale.shape
fc1_scale = fc1_scale.to(computeType)
fc2_scale = fc2_scale.to(computeType)
fc1_scale = rearrange(
fc1_scale.view(-1, 1)
.repeat(1, blk_n * blk_k)
......
......@@ -12,13 +12,13 @@ from aiter import logger
from aiter import per_token_quant_hip, per_block_quant_wrapper, get_hip_quant
from aiter import ActivationType, QuantType, dtypes
from aiter import silu_and_mul,gelu_and_mul
from aiter.ops.triton.fused_moe import (
triton_moe_sum,
triton_silu_and_mul,
triton_gelu_and_mul,
triton_relu2,
from aiter.ops.triton.fused_moe import triton_moe_sum
from aiter.ops.triton.moe_activation import (
_normalize_activation_and_gate,
_apply_activation,
)
from aiter.jit.core import AITER_ROOT_DIR
# from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size
# from vllm.model_executor.layers.quantization.utils.int8_utils import (
......@@ -111,6 +111,7 @@ def run_fused_experts_asm_impl(hidden_states: torch.Tensor,
dtype,
inplace,
activation,
None, # is_gated
use_fp8_w8a8,
use_int8_w8a8,
use_int8_w4a8,
......@@ -181,6 +182,7 @@ def fused_experts_asm_impl(hidden_states: torch.Tensor,
dtype: torch.dtype,
inplace: bool = False,
activation: str = "silu",
is_gated: Optional[bool] = None,
use_fp8_w8a8: bool = False,
use_int8_w8a8: bool = False,
use_int8_w4a8: bool = False,
......@@ -200,7 +202,12 @@ def fused_experts_asm_impl(hidden_states: torch.Tensor,
persist_cu: Optional[int] = 0,
use_shuffle: Optional[int] = 0,
solution_id: Optional[str] = None,
routed_scaling_factor: Optional[float] = 1.0)-> torch.Tensor:
routed_scaling_factor: Optional[float] = 1.0,
gemm1_alpha: Optional[float] = None,
gemm1_limit: Optional[float] = None)-> torch.Tensor:
activation, is_gated = _normalize_activation_and_gate(activation, is_gated)
# Check constraints.
if use_int8_w4a8:
assert block_shape[0] == 0 and block_shape[1] == 64, "[ERROR]ASM Fused MoE only support w4a8 block_shape=64 now."
......@@ -342,14 +349,14 @@ def fused_experts_asm_impl(hidden_states: torch.Tensor,
2,
config["SOL_ID1"],
config["BLOCK_SIZE_M"])
if activation == "silu":
triton_silu_and_mul(d_silu,d_w1_out)
# silu_and_mul(d_silu,d_w1_out)
elif activation == "gelu":
triton_gelu_and_mul(d_silu,d_w1_out)
# gelu_and_mul(d_silu,d_w1_out)
else:
raise ValueError(f"Unsupported FusedMoe activation: {activation}")
_apply_activation(
activation=activation,
is_gated=is_gated,
activated_out=d_silu,
ffn1_out_2d=d_w1_out.view(-1, N),
gemm1_alpha=gemm1_alpha,
gemm1_limit=gemm1_limit,
)
if dtype == torch.bfloat16:
if block_shape is not None and block_shape[1] == 32:
aiter.asm_fmoe_stage2(d_w2_out,
......@@ -442,14 +449,15 @@ def fused_experts_asm_impl(hidden_states: torch.Tensor,
odtype,
config["PERSIST_GROUP1"],
use_shuffle)
if activation == "silu":
triton_silu_and_mul(d_silu,d_w1_out)
# silu_and_mul(d_silu,d_w1_out)
elif activation == "gelu":
triton_gelu_and_mul(d_silu,d_w1_out)
# gelu_and_mul(d_silu,d_w1_out)
else:
raise ValueError(f"Unsupported FusedMoe activation: {activation}")
_apply_activation(
activation=activation,
is_gated=is_gated,
activated_out=d_silu,
ffn1_out_2d=d_w1_out.view(-1, N),
gemm1_alpha=gemm1_alpha,
gemm1_limit=gemm1_limit,
)
bridge_q,bridge_scale = per_token_quant_hip(d_silu)
#bridge_q,bridge_scale = per_token_quant_int8(d_silu)
......@@ -515,14 +523,15 @@ def fused_experts_asm_impl(hidden_states: torch.Tensor,
config["SOL_ID1"],
odtype,
config["PERSIST_GROUP1"])
if activation == "silu":
triton_silu_and_mul(d_silu,d_w1_out)
# silu_and_mul(d_silu,d_w1_out)
elif activation == "gelu":
triton_gelu_and_mul(d_silu,d_w1_out)
# gelu_and_mul(d_silu,d_w1_out)
else:
raise ValueError(f"Unsupported FusedMoe activation: {activation}")
_apply_activation(
activation=activation,
is_gated=is_gated,
activated_out=d_silu,
ffn1_out_2d=d_w1_out.view(-1, N),
gemm1_alpha=gemm1_alpha,
gemm1_limit=gemm1_limit,
)
#quant_func = get_hip_quant(QuantType.per_1x64)
#bridge_q,bridge_scale = quant_func(d_silu, quant_dtype=dtypes.i8)
......@@ -586,14 +595,15 @@ def fused_experts_asm_impl(hidden_states: torch.Tensor,
odtype,
config["PERSIST_GROUP1"],
use_shuffle)
if activation == "silu":
triton_silu_and_mul(d_silu,d_w1_out)
# silu_and_mul(d_silu,d_w1_out)
elif activation == "gelu":
triton_gelu_and_mul(d_silu,d_w1_out)
# gelu_and_mul(d_silu,d_w1_out)
else:
raise ValueError(f"Unsupported FusedMoe activation: {activation}")
_apply_activation(
activation=activation,
is_gated=is_gated,
activated_out=d_silu,
ffn1_out_2d=d_w1_out.view(-1, N),
gemm1_alpha=gemm1_alpha,
gemm1_limit=gemm1_limit,
)
#FIXME: aiter quant method performance is little worse than triton. Change it latter!!
bridge_q, bridge_scale = per_block_quant_wrapper((1,block_shape[1]))(per_token_quant_hip)(d_silu, quant_dtype=torch.int8)
......@@ -657,14 +667,15 @@ def fused_experts_asm_impl(hidden_states: torch.Tensor,
odtype,
config["PERSIST_GROUP1"],
use_shuffle)
if activation == "silu":
triton_silu_and_mul(d_silu,d_w1_out)
# silu_and_mul(d_silu,d_w1_out)
elif activation == "gelu":
triton_gelu_and_mul(d_silu,d_w1_out)
# gelu_and_mul(d_silu,d_w1_out)
else:
raise ValueError(f"Unsupported FusedMoe activation: {activation}")
_apply_activation(
activation=activation,
is_gated=is_gated,
activated_out=d_silu,
ffn1_out_2d=d_w1_out.view(-1, N),
gemm1_alpha=gemm1_alpha,
gemm1_limit=gemm1_limit,
)
bridge_q,bridge_scale= per_token_quant_hip(d_silu, quant_dtype=torch.float8_e4m3fn)
aiter.asm_fmoe_a8(d_w2_out,
......@@ -726,14 +737,15 @@ def fused_experts_asm_impl(hidden_states: torch.Tensor,
odtype,
config["PERSIST_GROUP1"],
use_shuffle)
if activation == "silu":
triton_silu_and_mul(d_silu,d_w1_out)
# silu_and_mul(d_silu,d_w1_out)
elif activation == "gelu":
triton_gelu_and_mul(d_silu,d_w1_out)
# gelu_and_mul(d_silu,d_w1_out)
else:
raise ValueError(f"Unsupported FusedMoe activation: {activation}")
_apply_activation(
activation=activation,
is_gated=is_gated,
activated_out=d_silu,
ffn1_out_2d=d_w1_out.view(-1, N),
gemm1_alpha=gemm1_alpha,
gemm1_limit=gemm1_limit,
)
bridge_q,bridge_scale = per_block_quant_wrapper((1,block_shape[1]))(per_token_quant_hip)(d_silu, quant_dtype=torch.float8_e4m3fn)
aiter.asm_fmoe_a8(d_w2_out,
......@@ -795,16 +807,14 @@ def fused_experts_asm_impl(hidden_states: torch.Tensor,
config["PERSIST_GROUP1"],
use_shuffle)
#return d_w1_out
if activation == "silu":
triton_silu_and_mul(d_silu,d_w1_out)
# silu_and_mul(d_silu,d_w1_out)
elif activation == "gelu":
triton_gelu_and_mul(d_silu,d_w1_out)
# gelu_and_mul(d_silu,d_w1_out)
elif activation == "relu2":
triton_relu2(d_silu,d_w1_out)
else:
raise ValueError(f"Unsupported FusedMoe activation: {activation}")
_apply_activation(
activation=activation,
is_gated=is_gated,
activated_out=d_silu,
ffn1_out_2d=d_w1_out.view(-1, N),
gemm1_alpha=gemm1_alpha,
gemm1_limit=gemm1_limit,
)
aiter.asm_fmoe_a8(d_w2_out,
d_silu,
......@@ -1136,4 +1146,4 @@ def calculate_persist_groups(persist_cu, config, quant_type):
if config[f"SOL_ID{i}"] in sol_id_table:
config[f"PERSIST_GROUP{i}"] = persist_cu * sol_id_table[config[f'SOL_ID{i}']]
else:
config[f"PERSIST_GROUP{i}"] = persist_cu
\ No newline at end of file
config[f"PERSIST_GROUP{i}"] = persist_cu
......@@ -17,10 +17,14 @@ import time
from aiter.test_common import perftest
import aiter
from aiter import dtypes
from aiter import moe_c_silu_and_mul,moe_c_moe_sum, per_token_quant_hip
from aiter import moe_c_silu_and_mul,moe_c_moe_sum_opt_v2, per_token_quant_hip
from aiter.jit.utils.torch_guard import torch_compile_guard
from aiter.ops.triton.fused_moe import triton_moe_sum
from triton.language.extra import libdevice
from aiter.ops.triton.moe_activation import (
_normalize_activation_and_gate,
_apply_activation,
)
logger = logging.getLogger(__name__)
......@@ -1960,6 +1964,82 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
**config,
)
# Floating-point dtypes accepted as the MoE compute dtype
# (checked by ``_resolve_moe_compute_dtype``).
_FLOAT_MOE_DTYPES = (torch.float16, torch.bfloat16, torch.float32)
# Hidden-state dtypes treated as already-quantized activations (int8 / fp8 e4m3).
_QUANTIZED_ACTIVATION_DTYPES = (torch.int8, torch.float8_e4m3fn)
def _is_prequantized_activation(
hidden_states_dtype: torch.dtype,
a_scale: Optional[torch.Tensor],
) -> bool:
return (
a_scale is not None
or hidden_states_dtype in _QUANTIZED_ACTIVATION_DTYPES
)
def _resolve_moe_compute_dtype(
    hidden_states_dtype: torch.dtype,
    compute_dtype: Optional[torch.dtype] = None,
    *,
    prequantized: bool = False,
) -> torch.dtype:
    """Resolve fp16/bf16/fp32 dtype for GEMM outputs, caches, and compute_type.

    - A floating ``hidden_states_dtype`` is returned as-is (``compute_dtype``
      is ignored in that case).
    - For pre-quantized input (``prequantized`` set, or a dtype listed in
      ``_QUANTIZED_ACTIVATION_DTYPES``) the explicit ``compute_dtype`` wins,
      with ``torch.bfloat16`` as the default.

    Raises:
        AssertionError: If a ``compute_dtype`` outside ``_FLOAT_MOE_DTYPES``
            is supplied for pre-quantized input.
        ValueError: If the hidden-state dtype is neither floating nor
            recognised as quantized.
    """
    if hidden_states_dtype in _FLOAT_MOE_DTYPES:
        return hidden_states_dtype

    quantized_input = (
        prequantized or hidden_states_dtype in _QUANTIZED_ACTIVATION_DTYPES
    )
    if not quantized_input:
        raise ValueError(
            f"Unsupported hidden_states dtype: {hidden_states_dtype}")

    if compute_dtype is None:
        return torch.bfloat16
    assert compute_dtype in _FLOAT_MOE_DTYPES, (
        f"compute_dtype must be fp16/bf16/fp32, got {compute_dtype}")
    return compute_dtype
def _torch_dtype_to_triton(dtype: torch.dtype):
if dtype == torch.bfloat16:
return tl.bfloat16
if dtype == torch.float16:
return tl.float16
if dtype == torch.float32:
return tl.float32
raise ValueError(f"Unsupported compute dtype for triton: {dtype}")
def _is_marlin_tensorwise_scale(
B_scale: Optional[torch.Tensor],
num_experts: int,
) -> bool:
# Marlin W8A8 tensorwise path expects one scale per expert,
# represented as (E, 1, 1) to stay compatible with existing 3D scale checks.
return B_scale is not None and B_scale.shape == (num_experts, 1, 1)
def _validate_prequant_marlin_activation(
A: torch.Tensor,
A_scale: torch.Tensor,
expected_dtype,
block_shape: Optional[List[int]],
B: torch.Tensor,
B_scale: torch.Tensor,
) -> None:
"""Validate pre-quantized activation (A, A_scale) for marlin MoE kernels."""
assert A_scale is not None
allowed = (expected_dtype,) if isinstance(expected_dtype, torch.dtype) else tuple(expected_dtype)
assert A.dtype in allowed, (
f"pre-quantized A must be one of {allowed}, got {A.dtype}")
if block_shape is None:
assert A_scale.shape[-1] == 1, (
f"per-token A_scale last dim must be 1, got shape {A_scale.shape}")
else:
assert len(block_shape) == 2
block_n, block_k = block_shape[0], block_shape[1]
assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
B: torch.Tensor,
C: torch.Tensor,
......@@ -1983,6 +2063,7 @@ def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
use_int4_w4a16: bool,
use_int4_w4a16_base: bool,
is_bottom: bool,
key_selected:int,
block_shape: Optional[List[int]] = None) -> None:
find_best = os.getenv("WHICH_TO_TEST")
assert topk_weights.stride(1) == 1
......@@ -1991,12 +2072,13 @@ def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
if use_fp8_w8a8:
assert B_scale is not None
if block_shape is None:
A, A_scale = per_token_quant_hip(A,quant_dtype=torch.float8_e4m3fn)
if A_scale is not None:
# Pre-quantized fp8 activation; skip internal quantization.
_validate_prequant_marlin_activation(
A, A_scale, (torch.float8_e4m3fn, torch.int8),
block_shape, B, B_scale)
elif block_shape is None:
A, A_scale = per_token_quant_hip(A, quant_dtype=torch.float8_e4m3fn)
else:
assert len(block_shape) == 2
block_n, block_k = block_shape[0], block_shape[1]
......@@ -2006,7 +2088,11 @@ def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
elif use_int8_w8a8:
assert B_scale is not None
if block_shape is None:
if A_scale is not None:
# Pre-quantized int8 activation; skip internal quantization.
_validate_prequant_marlin_activation(
A, A_scale, torch.int8, block_shape, B, B_scale)
elif block_shape is None:
A, A_scale = moe_kernel_prepare_input(
A=A,
B=B,
......@@ -2019,7 +2105,6 @@ def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
per_channel_quant=True,
block_shape=None
)
else:
assert len(block_shape) == 2
block_n, block_k = block_shape[0], block_shape[1]
......@@ -2029,7 +2114,11 @@ def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
elif use_int8_w4a8:
assert B_scale is not None
if block_shape is None:
if A_scale is not None:
# Pre-quantized int8 activation; skip internal quantization.
_validate_prequant_marlin_activation(
A, A_scale, torch.int8, block_shape, B, B_scale)
elif block_shape is None:
A, A_scale = moe_kernel_prepare_input(
A=A,
B=B,
......@@ -2043,7 +2132,6 @@ def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
per_channel_quant=True,
block_shape=None
)
else:
assert len(block_shape) == 2
block_n, block_k = block_shape[0], block_shape[1]
......@@ -2135,15 +2223,24 @@ def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
else :
bit = 4 if (use_int4_w4a16 or use_int4_w4a16_base) else 8
# print("calling wna16 awq -------")
if (bit == 8 and use_int8_w8a16) :
# print("calling w8a16 awq -------")
aiter.moe_c_moe_w8a16_gemm_awq(A, C, B, B_scale, B_zp,
topk_weights if mul_routed_weight else None,
sorted_token_ids, expert_ids,
num_tokens_post_padded, top_k,
config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
config["BLOCK_SIZE_K"], bit)
B = B.view(torch.uint32)
if is_bottom:
# print("calling w8a16 awq gemm2-------")
aiter.moe_c_moe_gemm_marlin_w8a16(A, B , C, B_scale, topk_weights, #B处应该传shuffle权重 待修改
sorted_token_ids, expert_ids, num_tokens_post_padded,1, config["MODE"], 1)
else:
# print("calling w8a16 awq gemm1-------")
aiter.moe_c_moe_gemm_marlin_w8a16(A, B, C, B_scale, None, #B处应该传shuffle权重 待修改
sorted_token_ids, expert_ids, num_tokens_post_padded,8, config["MODE"], 1)
# print("calling wna16 awq -------")
# # print("calling w8a16 awq -------")
# aiter.moe_c_moe_w8a16_gemm_awq(A, C, B, B_scale, B_zp,
# topk_weights if mul_routed_weight else None,
# sorted_token_ids, expert_ids,
# num_tokens_post_padded, top_k,
# config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
# config["BLOCK_SIZE_K"], bit)
return
elif use_int4_w4a16_base :
......@@ -2185,19 +2282,29 @@ def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
# print("calling w4a16 awq end-------")
return
elif (use_int8_w8a8 and block_shape == None):
if is_bottom:
# print("B.shape",B.shape)
assert B.shape[1] in [7168,6144,4096,3072,2048] , f" K = {B.shape[1]} is not in support"
assert B.shape[2] in [128,256,384,768,2048] , f" N = {B.shape[2]} is not in support"
# print("calling w8a8 channel wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8(A, B, C, A_scale, B_scale,topk_weights,
sorted_token_ids, expert_ids, num_tokens_post_padded,1, config["MODE"], top_k)
else :
# print("calling w8a8 channel wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8(A, B, C, A_scale, B_scale, None,
sorted_token_ids, expert_ids, num_tokens_post_padded,top_k, config["MODE"], top_k)
if _is_marlin_tensorwise_scale(B_scale, B.shape[0]):
if is_bottom:
# print("calling w8a8 tensor wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8_tensorwise(A, B, C, A_scale, B_scale, topk_weights,
sorted_token_ids, expert_ids, num_tokens_post_padded, 1, config["MODE"], top_k, key_selected)
else:
# print("calling w8a8 tensor wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8_tensorwise(A, B, C, A_scale, B_scale, None,
sorted_token_ids, expert_ids, num_tokens_post_padded, top_k, config["MODE"], top_k, key_selected)
else:
if is_bottom:
# print("B.shape",B.shape)
assert B.shape[1] in [7168,6144,4096,3072,2048] , f" K = {B.shape[1]} is not in support"
assert B.shape[2] in [128,256,384,512,768,1024,2048] , f" N = {B.shape[2]} is not in support"
# print("calling w8a8 channel wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8(A, B, C, A_scale, B_scale,topk_weights,
sorted_token_ids, expert_ids, num_tokens_post_padded,1, config["MODE"], top_k,key_selected)
else :
# print("calling w8a8 channel wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8(A, B, C, A_scale, B_scale, None,
sorted_token_ids, expert_ids, num_tokens_post_padded,top_k, config["MODE"], top_k,key_selected)
return
elif (use_int8_w4a8 and block_shape == None):
if is_bottom:
......@@ -2212,7 +2319,7 @@ def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
aiter.moe_c_moe_gemm_marlin_w4a8(A, B, C, A_scale, B_scale,topk_weights,
sorted_token_ids, expert_ids, num_tokens_post_padded,1, config["MODE"],top_k)
sorted_token_ids, expert_ids, num_tokens_post_padded,1, config["MODE"], top_k,key_selected)
# end_event.record()
# end_event.synchronize()
......@@ -2231,7 +2338,7 @@ def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
# start_event.record()
aiter.moe_c_moe_gemm_marlin_w4a8(A, B, C, A_scale, B_scale, None,
sorted_token_ids, expert_ids, num_tokens_post_padded,top_k, config["MODE"],top_k)
sorted_token_ids, expert_ids, num_tokens_post_padded,top_k, config["MODE"], top_k,key_selected)
# end_event.record()
......@@ -2242,16 +2349,28 @@ def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
return
elif (use_fp8_w8a8 and block_shape == None):
if is_bottom:
# print("calling w8a8 channel wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8_fp8(A, B, C, A_scale, B_scale,topk_weights,
sorted_token_ids, expert_ids, num_tokens_post_padded,1, config["MODE"], top_k)
else :
# print("calling w8a8 channel wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8_fp8(A, B, C, A_scale, B_scale, None,
sorted_token_ids, expert_ids, num_tokens_post_padded,top_k, config["MODE"], top_k)
if _is_marlin_tensorwise_scale(B_scale, B.shape[0]):
if is_bottom:
# print("calling fp8 w8a8 tensor wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8_fp8_tensorwise(A, B, C, A_scale, B_scale,topk_weights,
sorted_token_ids, expert_ids, num_tokens_post_padded,1, config["MODE"], top_k,key_selected)
else :
# print("calling fp8 w8a8 tensor wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8_fp8_tensorwise(A, B, C, A_scale, B_scale, None,
sorted_token_ids, expert_ids, num_tokens_post_padded,top_k, config["MODE"], top_k,key_selected)
else:
if is_bottom:
# print("calling w8a8 channel wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8_fp8(A, B, C, A_scale, B_scale,topk_weights,
sorted_token_ids, expert_ids, num_tokens_post_padded,1, config["MODE"], top_k,key_selected)
else :
# print("calling w8a8 channel wise -------")
aiter.moe_c_moe_gemm_marlin_w8a8_fp8(A, B, C, A_scale, B_scale, None,
sorted_token_ids, expert_ids, num_tokens_post_padded,top_k, config["MODE"], top_k,key_selected)
return
......@@ -2591,7 +2710,10 @@ def try_get_optimal_moe_config_marlin(
if configs:
# If an optimal configuration map has been found, look up the
# optimal config
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
key_selected = min(configs.keys(), key=lambda x: abs(x - M))
config = configs[key_selected]
config["key_selected"] = key_selected
else:
# Else use the default config
config = get_default_config(M, E, N, w1_shape[2], top_k, dtype,
......@@ -2756,6 +2878,7 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
kloops2: int,
nloops2: int,
activation: Optional[str] = None,
is_gated: Optional[bool] = None,
use_fp8_w8a8: bool = False,
use_int8_w8a8: bool = False,
use_int8_w4a8: bool = False,
......@@ -2771,23 +2894,27 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[List[int]] = None,
routed_scaling_factor: Optional[float] = 1.0) -> None:
routed_scaling_factor: Optional[float] = 1.0,
gemm1_alpha: Optional[float] = None,
gemm1_limit: Optional[float] = None,
compute_dtype: Optional[torch.dtype] = None) -> None:
if activation is None:
activation = "silu"
if (use_int4_w4a16 or (use_int8_w8a8 and block_shape == None) or (use_fp8_w8a8 and block_shape == None) or (use_int8_w4a8 and block_shape == None) ):
if (use_int4_w4a16 or (use_int8_w8a8 and block_shape == None) or (use_fp8_w8a8 and block_shape == None) or (use_int8_w8a16 and block_shape == None) or (use_int8_w4a8 and block_shape == None) ):
fused_experts_impl_marlin(hidden_states, w1, w2, topk_weights, topk_ids, MODE1, MODE2, BM,
True, activation, use_fp8_w8a8, use_int8_w8a8,use_int8_w4a8, use_int8_w8a16,
True, activation,is_gated, use_fp8_w8a8, use_int8_w8a8,use_int8_w4a8, use_int8_w8a16,
use_int4_w4a16, use_int4_w4a16_base, global_num_experts, expert_map,
w1_scale, w2_scale, w1_zp, w2_zp, a1_scale,
a2_scale, block_shape, routed_scaling_factor)
a2_scale, block_shape, routed_scaling_factor,gemm1_alpha,gemm1_limit,
compute_dtype=compute_dtype)
else:
fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids ,BM,BN,BK,kloops, nloops,BN2,
BK2,kloops2,nloops2,True,
activation, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16,
activation, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16,
use_int4_w4a16, use_int4_w4a16_base, global_num_experts, expert_map,
w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale,
block_shape, routed_scaling_factor)
......@@ -2836,6 +2963,7 @@ def outplace_fused_experts(
kloops2: int,
nloops2: int,
activation: Optional[str] = None,
is_gated: Optional[bool] = None,
use_fp8_w8a8: bool = False,
use_int8_w8a8: bool = False,
use_int8_w4a8: bool = False,
......@@ -2851,17 +2979,21 @@ def outplace_fused_experts(
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[List[int]] = None,
routed_scaling_factor: Optional[float] = 1.0) -> torch.Tensor:
routed_scaling_factor: Optional[float] = 1.0,
gemm1_alpha: Optional[float] = None,
gemm1_limit: Optional[float] = None,
compute_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
if activation is None:
activation = "silu"
if (use_int4_w4a16 or (use_int8_w8a8 and block_shape == None) or (use_fp8_w8a8 and block_shape == None) or (use_int8_w4a8 and block_shape == None) ):
if (use_int4_w4a16 or (use_int8_w8a8 and block_shape == None) or (use_fp8_w8a8 and block_shape == None) or (use_int8_w8a16 and block_shape == None) or (use_int8_w4a8 and block_shape == None) ):
return fused_experts_impl_marlin(hidden_states, w1, w2, topk_weights, topk_ids, MODE1, MODE2, BM,
False, activation, use_fp8_w8a8, use_int8_w8a8,use_int8_w4a8, use_int8_w8a16,
False, activation,is_gated, use_fp8_w8a8, use_int8_w8a8,use_int8_w4a8, use_int8_w8a16,
use_int4_w4a16, use_int4_w4a16_base, global_num_experts, expert_map,
w1_scale, w2_scale, w1_zp, w2_zp, a1_scale,
a2_scale, block_shape, routed_scaling_factor)
a2_scale, block_shape, routed_scaling_factor,gemm1_alpha,gemm1_limit,
compute_dtype=compute_dtype)
return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,BM,BN,BK,kloops,nloops,BN2,
......@@ -2981,6 +3113,7 @@ def moe_c_fused_experts(hidden_states: torch.Tensor,
nloops2: int = 1,
inplace: bool = False,
activation: Optional[str] = None,
is_gated: Optional[bool] = None,
use_fp8_w8a8: bool = False,
use_int8_w8a8: bool = False,
use_int8_w4a8: bool = False,
......@@ -2996,7 +3129,11 @@ def moe_c_fused_experts(hidden_states: torch.Tensor,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[List[int]] = None,
routed_scaling_factor: Optional[float] = 1.0) -> torch.Tensor:
routed_scaling_factor: Optional[float] = 1.0,
gemm1_alpha: Optional[float] = None,
gemm1_limit: Optional[float] = None,
compute_dtype: Optional[torch.dtype] = None,
) -> torch.Tensor:
# assert not (use_int8_w4a8 and hidden_states.shape[0] < 1024) , "only support M >= 1024"
......@@ -3007,19 +3144,21 @@ def moe_c_fused_experts(hidden_states: torch.Tensor,
inplace_fused_experts(
hidden_states, w1, w2, topk_weights, topk_ids,MODE1,MODE2,BM,BN,BK,kloops,nloops,BN2,
BK2,kloops2,nloops2,activation,
BK2,kloops2,nloops2,activation,is_gated,
use_fp8_w8a8, use_int8_w8a8, use_int8_w4a8,use_int8_w8a16, use_int4_w4a16, use_int4_w4a16_base, global_num_experts,
expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale,
block_shape, routed_scaling_factor)
block_shape, routed_scaling_factor,gemm1_alpha,gemm1_limit,
compute_dtype=compute_dtype)
# print("hidden_states",hidden_states)
return hidden_states
else:
return outplace_fused_experts(
hidden_states, w1, w2, topk_weights, topk_ids,MODE1,MODE2,BM,BN,BK,kloops,nloops,BN2,
BK2,kloops2,nloops2,activation,
BK2,kloops2,nloops2,activation,is_gated,
use_fp8_w8a8, use_int8_w8a8,use_int8_w4a8, use_int8_w8a16, use_int4_w4a16, use_int4_w4a16_base, global_num_experts,
expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale,
block_shape, routed_scaling_factor)
block_shape, routed_scaling_factor,gemm1_alpha,gemm1_limit,
compute_dtype=compute_dtype)
......@@ -3175,6 +3314,7 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
BM: int,
inplace: bool = False,
activation: str = "silu",
is_gated: Optional[bool] = None,
use_fp8_w8a8: bool = False,
use_int8_w8a8: bool = False,
use_int8_w4a8: bool = False,
......@@ -3190,7 +3330,29 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[List[int]] = None,
routed_scaling_factor: Optional[float] = 1.0):
routed_scaling_factor: Optional[float] = 1.0,
gemm1_alpha: Optional[float] = None,
gemm1_limit: Optional[float] = None,
compute_dtype: Optional[torch.dtype] = None):
activation, is_gated = _normalize_activation_and_gate(activation, is_gated)
prequantized_input = _is_prequantized_activation(
hidden_states.dtype, a1_scale)
if prequantized_input:
assert use_fp8_w8a8 or use_int8_w8a8 or use_int8_w4a8, (
"pre-quantized activation requires use_fp8_w8a8, "
"use_int8_w8a8, or use_int8_w4a8")
else:
assert hidden_states.dtype in _FLOAT_MOE_DTYPES, (
f"hidden_states must be fp16/bf16/fp32, got {hidden_states.dtype}")
compute_dtype = _resolve_moe_compute_dtype(
hidden_states.dtype,
compute_dtype,
prequantized=prequantized_input,
)
# Check constraints.
if use_int4_w4a16 or use_int8_w4a8:
assert hidden_states.shape[1] // 2 == w1.shape[
......@@ -3202,9 +3364,6 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
assert hidden_states.dtype in [
torch.float32, torch.float16, torch.bfloat16
]
num_tokens, _ = hidden_states.shape
E, N, _ = w1.shape
......@@ -3220,7 +3379,7 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
use_int8_w4a8=use_int8_w4a8,
use_int8_w8a16=use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16,
dtype=hidden_states.dtype)
dtype=compute_dtype)
get_config_func = functools.partial(
try_get_optimal_moe_config_marlin,
......@@ -3244,7 +3403,7 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
# cache3, we're done with cache1
cache13 = torch.empty(M * top_k_num * max(N, w2.shape[1]),
device=hidden_states.device,
dtype=hidden_states.dtype)
dtype=compute_dtype)
intermediate_cache1 = cache13[:M * top_k_num * N].view(
(M, topk_ids.shape[1], N))
intermediate_cache3 = cache13[:M * top_k_num * w2.shape[1]].view(
......@@ -3253,25 +3412,21 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
# This needs separate memory since it's used concurrently with cache1
intermediate_cache2 = torch.empty((M * top_k_num, N // 2),
device=hidden_states.device,
dtype=hidden_states.dtype)
dtype=compute_dtype)
if hidden_states.dtype == torch.bfloat16:
compute_type = tl.bfloat16
elif hidden_states.dtype == torch.float16:
compute_type = tl.float16
elif hidden_states.dtype == torch.float32:
compute_type = tl.float32
else:
raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
compute_type = _torch_dtype_to_triton(compute_dtype)
if inplace:
assert not prequantized_input, (
"inplace is not supported when hidden_states is pre-quantized")
assert hidden_states.dtype == compute_dtype
out_hidden_states = hidden_states
# out_hidden_states = torch.empty_like(hidden_states)
else:
out_hidden_states = torch.empty_like(hidden_states)
out_hidden_states = torch.empty(
(num_tokens, hidden_states.shape[1]),
device=hidden_states.device,
dtype=compute_dtype,
)
......@@ -3298,11 +3453,14 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx]
curr_a1_scale = (a1_scale[begin_chunk_idx:end_chunk_idx]
if a1_scale is not None else None)
curr_a2_scale = (a2_scale[begin_chunk_idx:end_chunk_idx]
if a2_scale is not None else None)
find_best = os.environ.get("WHICH_TO_TEST")
if(find_best):
if(use_int4_w4a16 or use_int8_w4a8):
if(use_int4_w4a16 or use_int8_w4a8 or use_int8_w8a16 ):
sorted_token_ids, expert_ids, num_tokens_post_padded = (
moe_align_block_size(curr_topk_ids, BM,
global_num_experts, expert_map))
......@@ -3312,13 +3470,13 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
topk_weights,
global_num_experts,
7168,
hidden_states.dtype,
compute_dtype,
BM,
expert_mask=None,
)
else:
if(use_int4_w4a16 or use_int8_w4a8):
if(use_int4_w4a16 or use_int8_w4a8 or use_int8_w8a16 ):
# print("*****************************",config["BLOCK_SIZE_M"])
sorted_token_ids, expert_ids, num_tokens_post_padded = (
moe_align_block_size(curr_topk_ids, config["BLOCK_SIZE_M"],
......@@ -3333,7 +3491,7 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
topk_weights,
global_num_experts,
7168,
hidden_states.dtype,
compute_dtype,
config["BLOCK_SIZE_M"],
expert_mask=None,
)
......@@ -3350,7 +3508,7 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
invoke_fused_moe_kernel_marlin(curr_hidden_states,
w1,
intermediate_cache1,
a1_scale,
curr_a1_scale,
w1_scale,
w1_zp,
curr_topk_weights,
......@@ -3370,16 +3528,24 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
use_int4_w4a16=use_int4_w4a16,
use_int4_w4a16_base=use_int4_w4a16_base,
is_bottom = False,
key_selected=config["key_selected"],
block_shape=block_shape)
if activation == "silu":
moe_c_silu_and_mul(intermediate_cache2,
intermediate_cache1.view(-1, N))
# elif activation == "gelu":
# torch.ops._C.gelu_and_mul(intermediate_cache2,
# intermediate_cache1.view(-1, N))
rows_per_block,vec_size =aiter.load_silu_tune_config(M * top_k_num,N // 2)
moe_c_silu_and_mul(intermediate_cache2,
intermediate_cache1.view(-1, N),rows_per_block,vec_size)
else:
raise ValueError(f"Unsupported FusedMoe activation: {activation}")
_apply_activation(
activation=activation,
is_gated=is_gated,
activated_out=intermediate_cache2,
ffn1_out_2d=intermediate_cache1.view(-1, N),
gemm1_alpha=gemm1_alpha,
gemm1_limit=gemm1_limit,
)
# use_moe_wna16_cuda = should_moe_wna16_use_cuda(
# num_valid_tokens=topk_ids.numel(),
......@@ -3400,7 +3566,7 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
invoke_fused_moe_kernel_marlin(intermediate_cache2,
w2,
intermediate_cache3,
a2_scale,
curr_a2_scale,
w2_scale,
w2_zp,
curr_topk_weights,
......@@ -3420,6 +3586,7 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
use_int4_w4a16=use_int4_w4a16,
use_int4_w4a16_base=use_int4_w4a16_base,
is_bottom = True,
key_selected=config["key_selected"] ,
block_shape=block_shape)
mode_use_triton_moe_sum = out_hidden_states.dtype == torch.float16 or \
out_hidden_states.dtype == torch.bfloat16 or \
......@@ -3432,8 +3599,8 @@ def fused_experts_impl_marlin(hidden_states: torch.Tensor,
# out_hidden_states[begin_chunk_idx:end_chunk_idx])
triton_moe_sum(intermediate_cache3.view(*intermediate_cache3.shape), out_hidden_states[begin_chunk_idx:end_chunk_idx] , routed_scaling_factor)
else:
moe_c_moe_sum(intermediate_cache3.view(*intermediate_cache3.shape),
out_hidden_states[begin_chunk_idx:end_chunk_idx],curr_topk_ids)
moe_c_moe_sum_opt_v2(intermediate_cache3.view(*intermediate_cache3.shape),
out_hidden_states[begin_chunk_idx:end_chunk_idx],routed_scaling_factor)
......
......@@ -552,7 +552,7 @@ def build_module(
"-Wno-vla-cxx-extension",
"-Wno-undefined-func-template",
"-Wno-macro-redefined",
"-Wno-missing-template-arg-list-after-template-kw",
# "-Wno-missing-template-arg-list-after-template-kw",
"-fgpu-flush-denormals-to-zero",
]
......@@ -794,6 +794,7 @@ def compile_ops(
fc_name: Optional[str] = None,
gen_func: Optional[Callable[..., dict[str, Any]]] = None,
gen_fake: Optional[Callable[..., Any]] = None,
develop: bool = False,
):
def decorator(func):
func.arg_checked = False
......@@ -897,12 +898,18 @@ def compile_ops(
doc_str = re.sub(pattern, r"Optional[\1]", doc_str)
for el in enum_types:
doc_str = re.sub(f" aiter.*{el} ", f" {el} ", doc_str)
try:
from ..utility.aiter_types import aiter_tensor_t as _aiter_tensor_t
except ImportError:
_aiter_tensor_t = None
namespace = {
"List": List,
"Optional": Optional,
"torch": torch,
"typing": typing,
}
if _aiter_tensor_t is not None:
namespace["aiter_tensor_t"] = _aiter_tensor_t
exec(
f"from aiter import*\ndef {doc_str}: pass",
......@@ -955,13 +962,34 @@ def compile_ops(
return True
if not func.arg_checked:
func.arg_checked = check_args()
if develop:
func.arg_checked = True # skip type-check when develop=True; tensors are converted below
else:
func.arg_checked = check_args()
if AITER_LOG_MORE == 2:
from ..test_common import log_args
log_args(func, *args, **kwargs)
# develop=True: convert torch.Tensor → pybind aiter_tensor_t and inject HIP stream.
# develop=False (default): all existing ops pass through unchanged.
if develop:
import torch
from ..utility.dtypes import torch_to_aiter_pybind
args = tuple(
torch_to_aiter_pybind(a) if isinstance(a, torch.Tensor) else a
for a in args
)
kwargs = {
k: (torch_to_aiter_pybind(v) if isinstance(v, torch.Tensor) else v)
for k, v in kwargs.items()
}
module._set_current_hip_stream(
torch.cuda.current_stream().cuda_stream
)
return op(*args, **kwargs)
@torch_compile_guard(device="cuda", gen_fake=gen_fake, calling_func_=func)
......
......@@ -80,6 +80,41 @@
"hipify": "True",
"blob_gen_cmd": "''"
},
"module_grouped_gemm": {
"srcs": [
"f'{AITER_CSRC_DIR}/pybind/grouped_gemm_ck_pybind.cu'",
"f'{AITER_CSRC_DIR}/py_itfs_ck/grouped_gemm_kernels.cu'",
"f'{CK_DIR}/example_hcu/ck_tile/19_grouped_gemm/grouped_gemm.cpp'",
"f'{CK_DIR}/example_hcu/ck_tile/19_grouped_gemm/instances/grouped_gemm_fp16.cpp'",
"f'{CK_DIR}/example_hcu/ck_tile/19_grouped_gemm/instances/grouped_gemm_bf16.cpp'",
"f'{CK_DIR}/example_hcu/ck_tile/19_grouped_gemm/instances/grouped_gemm_fp8.cpp'",
"f'{CK_DIR}/example_hcu/ck_tile/19_grouped_gemm/instances/grouped_gemm_int8.cpp'"
],
"flags_extra_cc": [
"'-DCK_TILE_GROUPED_GEMM_FAST_BUILD'",
"'-DCK_TILE_GROUPED_GEMM_FAST_FP16'",
"'-DCK_TILE_GROUPED_GEMM_FAST_BF16'",
"'-DCK_TILE_GROUPED_GEMM_FAST_FP8'",
"'-DCK_TILE_GROUPED_GEMM_FAST_INT8'",
"'-DCK_TILE_GROUPED_GEMM_FAST_RC_ONLY'"
],
"flags_extra_hip": [
"'-DCK_TILE_GROUPED_GEMM_FAST_BUILD'",
"'-DCK_TILE_GROUPED_GEMM_FAST_FP16'",
"'-DCK_TILE_GROUPED_GEMM_FAST_BF16'",
"'-DCK_TILE_GROUPED_GEMM_FAST_FP8'",
"'-DCK_TILE_GROUPED_GEMM_FAST_INT8'",
"'-DCK_TILE_GROUPED_GEMM_FAST_RC_ONLY'"
],
"extra_ldflags": "None",
"extra_include": [
"f'{CK_DIR}/example_hcu/ck_tile/19_grouped_gemm'",
"f'{CK_DIR}/example_hcu/ck_tile/19_grouped_gemm/instances'"
],
"verbose": "False",
"hipify": "False",
"blob_gen_cmd": "''"
},
"module_moe_utils":{
"srcs": [
"f'{AITER_CSRC_DIR}/pybind/moe_utils_pybind.cu'",
......@@ -351,7 +386,7 @@
"f'{MOE_C_DIR}/csrc_for_aiter'",
"f'{AITER_CSRC_DIR}/py_itfs_moe_c/moe_c.cu'"
],
"flags_extra_cc": ["' -mllvm -support-768-vgprs=true -mllvm -disable-machine-sink '"
"flags_extra_cc": ["' -mllvm -support-768-vgprs=true -mllvm -disable-machine-sink -w'"
],
"flags_extra_hip": [],
"extra_ldflags": "None",
......@@ -386,5 +421,17 @@
"verbose": "False",
"hipify": "True",
"blob_gen_cmd": "''"
},
"module_mhc": {
"srcs": [
"f'{AITER_CSRC_DIR}/pybind/mhc_pybind.cu'",
"f'{AITER_CSRC_DIR}/kernels/mhc_kernels.cu'"
],
"flags_extra_cc": [],
"flags_extra_hip": [],
"extra_ldflags": "None",
"extra_include": [],
"verbose": "False",
"blob_gen_cmd": "''"
}
}
......@@ -22,9 +22,11 @@ class MoeQuantType:
"""Quantization types supported by get_aiter_moe_config / aiter_moe."""
W16A16 = "w16a16"
W4A16 = "w4a16"
W8A8 = "w8a8"
FP8_W8A8 = "fp8_w8a8"
W4A8 = "w4a8"
W8A8 = "int8_w8a8"
FP8_W8A8 = "fp8_w8a8"
INT8_W8A16 = "int8_w8a16"
FP8_W8A16 = "fp8_w8a16"
@dataclass
......@@ -90,6 +92,16 @@ def _try_get_moe_c_config(
is_bottom=False,
use_moe_wna16_cuda=True,
)
elif quant_type == MoeQuantType.INT8_W8A16:
configs = get_moe_configs_marlin(
E=e,
N=n,
dtype="int8_w8a16",
block_n=0,
block_k=block_size if block_size else 0,
is_bottom=False,
use_moe_wna16_cuda=True,
)
else:
return None
......@@ -139,7 +151,7 @@ def _try_get_asm_config(
if quant_type == MoeQuantType.W8A8:
from .fused_moe_asm_wna16 import decode_sol_0
asm_quant_type = AsmMoeQuantType.INT8_W8A8_C if (block_size == 0 or block_size is None) else AsmMoeQuantType.INT8_W8A8
solution = get_moe_asm_solution(
arch=arch,
token=m,
......@@ -147,7 +159,7 @@ def _try_get_asm_config(
model_dim=k,
expert=e,
topk=top_k,
quant_type=AsmMoeQuantType.INT8_W8A8,
quant_type=asm_quant_type,
)
if solution == "default":
return None
......@@ -155,7 +167,7 @@ def _try_get_asm_config(
if quant_type == MoeQuantType.FP8_W8A8:
from .fused_moe_asm_wna16 import decode_sol_0
asm_quant_type = AsmMoeQuantType.F8_W8A8_C if (block_size == 0 or block_size is None) else AsmMoeQuantType.F8_W8A8
solution = get_moe_asm_solution(
arch=arch,
token=m,
......@@ -163,7 +175,7 @@ def _try_get_asm_config(
model_dim=k,
expert=e,
topk=top_k,
quant_type=AsmMoeQuantType.F8_W8A8,
quant_type=asm_quant_type,
)
if solution == "default":
return None
......@@ -208,6 +220,7 @@ def _try_get_triton_config(
MoeQuantType.W4A16: "int4_w4a16",
MoeQuantType.W8A8: "int8_w8a8",
MoeQuantType.FP8_W8A8: "fp8_w8a8",
MoeQuantType.INT8_W8A16: "int8_w8a16",
}.get(quant_type)
if dtype_name is None:
return None
......@@ -238,18 +251,22 @@ def _try_get_ck_config(
block_shape: Optional[List[int]],
) -> Optional[Dict[str, Any]]:
try:
if quant_type not in (MoeQuantType.W8A8, MoeQuantType.FP8_W8A8):
return None
from .fused_moe_ck import get_moe_ck_solution_id, MoeQuantType as CkMoeQuantType
from .jit.utils.chip_info import get_gfx
if quant_type == MoeQuantType.W16A16:
ck_quant_type = CkMoeQuantType.NO_QUANT
elif quant_type == MoeQuantType.W8A8 or quant_type == MoeQuantType.FP8_W8A8:
ck_quant_type = CkMoeQuantType.INT8_W8A8
else:
return None
arch = get_gfx()
q_size_n = block_shape[0] if block_shape is not None else 0
q_size_k = block_shape[1] if block_shape is not None else 0
solution_id = get_moe_ck_solution_id(
arch,
CkMoeQuantType.INT8_W8A8,
ck_quant_type,
m,
n,
k,
......@@ -274,7 +291,7 @@ def get_aiter_moe_config(
block_size: int,
dtype: torch.dtype,
quant_type: str,
activation: str = "silu", # "silu"/"gelu"/"relu2"/...
activation: str = "silu", # "silu"/"gelu"/"relu2"/"swigluoai"/"swiglustep"...
gated: Optional[bool] = None, # True=GLU-gated (N1=2*inter), False=non-gated (N1=inter); None=auto from activation
) -> Tuple[bool, AiterMoeConfig]:
"""Get the best backend config for a MOE problem.
......@@ -285,6 +302,7 @@ def get_aiter_moe_config(
- ``MoeQuantType.W8A8`` (int8)
- ``MoeQuantType.FP8_W8A8`` (fp8)
- ``MoeQuantType.W4A8``
- ``MoeQuantType.INT8_W8A16`` (int8 weight, fp16/bf16 activation)
Backend priority:
- ``w16a16``: asm > triton
......@@ -292,6 +310,8 @@ def get_aiter_moe_config(
- ``w8a8``: asm > moe_c > triton > ck
- ``fp8_w8a8``: asm > moe_c > triton > ck
- ``w4a8``: moe_c
- ``int8_w8a16``: moe_c > triton (ASM kernel not available)
- ``fp8_w8a16``: not yet implemented (raises NotImplementedError)
For non-gated MOE (e.g. Nemotron with ReLU² activation), pass
``gated=False`` (or let it auto-detect from ``activation="relu2"``)
......@@ -299,7 +319,7 @@ def get_aiter_moe_config(
"""
# Determine gating: explicit > auto-detect from activation
if gated is None:
gated = activation in ("silu", "gelu")
gated = activation in ("silu", "gelu", "swigluoai", "swiglustep")
# For gated (GLU): N1 = 2 * intermediate_size, n = N1 // 2
# For non-gated: N1 = intermediate_size, n = N1
......@@ -310,17 +330,20 @@ def get_aiter_moe_config(
if dtype == torch.float16:
candidates = [
(MoeSolutionType.MOE_C, lambda: _try_get_moe_c_config(quant_type, M, E, n, block_size)),
(MoeSolutionType.TRITON, lambda: _try_get_triton_config(quant_type, M, E, n, block_size)),
]
elif dtype == torch.bfloat16:
candidates = [
(MoeSolutionType.ASM, lambda: _try_get_asm_config(quant_type, M, E, n, K, top_k, block_size)),
(MoeSolutionType.TRITON, lambda: _try_get_triton_config(quant_type, M, E, n, block_size)),
(MoeSolutionType.MOE_C, lambda: _try_get_moe_c_config(quant_type, M, E, n, block_size)),
]
else:
raise ValueError(f"Unsupported dtype: {dtype}")
elif quant_type in (MoeQuantType.W8A8, MoeQuantType.FP8_W8A8):
if block_size == 0: # Channel wise choose MOE_C
if block_size is None or block_size == 0: # Channel wise
candidates = [
(MoeSolutionType.ASM, lambda: _try_get_asm_config(quant_type, M, E, n, K, top_k, block_size)),
(MoeSolutionType.MOE_C, lambda: _try_get_moe_c_config(quant_type, M, E, n, block_size)),
(MoeSolutionType.TRITON, lambda: _try_get_triton_config(quant_type, M, E, n, block_size)),
# (MoeSolutionType.CK, lambda: _try_get_ck_config(quant_type, M, E, n, K, top_k, block_shape)),
......@@ -328,17 +351,31 @@ def get_aiter_moe_config(
else: # Block wise choose ASM
candidates = [
(MoeSolutionType.ASM, lambda: _try_get_asm_config(quant_type, M, E, n, K, top_k, block_size)),
(MoeSolutionType.TRITON, lambda: _try_get_triton_config(quant_type, M, E, n, block_size)),
]
elif quant_type == MoeQuantType.W4A8:
candidates = [
(MoeSolutionType.MOE_C, lambda: _try_get_moe_c_config(quant_type, M, E, n, block_size)),
(MoeSolutionType.TRITON, lambda: _try_get_triton_config(quant_type, M, E, n, block_size)),
# (MoeSolutionType.ASM, lambda: _try_get_asm_config(quant_type, M, E, n, K, top_k)),
]
elif quant_type == MoeQuantType.INT8_W8A16:
# ASM backend currently has no W8A16 kernel/CSV; skip ASM and use moe_c -> triton.
candidates = [
(MoeSolutionType.MOE_C, lambda: _try_get_moe_c_config(quant_type, M, E, n, block_size)),
(MoeSolutionType.TRITON, lambda: _try_get_triton_config(quant_type, M, E, n, block_size)),
]
elif quant_type == MoeQuantType.FP8_W8A16:
# No backend currently implements FP8 weight + 16-bit activation MoE.
raise NotImplementedError(
"MoeQuantType.FP8_W8A16 is not yet supported by any aiter MOE backend (asm/moe_c/triton)."
)
elif quant_type == MoeQuantType.W16A16:
candidates = [
(MoeSolutionType.ASM, lambda: _try_get_asm_config(quant_type, M, E, n, K, top_k, None)),
(MoeSolutionType.TRITON, lambda: _try_get_triton_config(quant_type, M, E, n, block_size)),
# (MoeSolutionType.CK, lambda: _try_get_ck_config(quant_type, M, E, n, K, top_k, block_shape)),
]
else:
raise ValueError(f"Unsupported quant_type: {quant_type}")
......@@ -374,6 +411,10 @@ def aiter_moe(
global_num_experts: int = -1,
expert_map: Optional[torch.Tensor] = None,
routed_scaling_factor: Optional[float] = 1.0,
use_weight_shuffle: bool = False,
output_dtype: Optional[torch.dtype] = None,
gemm1_alpha: Optional[float] = None,
gemm1_limit: Optional[float] = None
) -> torch.Tensor:
"""Execute MOE using the backend and quant type described by *moe_config*."""
if moe_config.solution_type is None or moe_config.quant_type is None:
......@@ -381,11 +422,15 @@ def aiter_moe(
"moe_config has no valid solution_type/quant_type. "
"Call get_aiter_moe_config first and check the status."
)
if output_dtype is None:
output_dtype = hidden_states.dtype
use_int4_w4a16 = moe_config.quant_type == MoeQuantType.W4A16
use_int8_w8a8 = moe_config.quant_type == MoeQuantType.W8A8
use_fp8_w8a8 = moe_config.quant_type == MoeQuantType.FP8_W8A8
use_int8_w4a8 = moe_config.quant_type == MoeQuantType.W4A8
use_int8_w8a16 = moe_config.quant_type == MoeQuantType.INT8_W8A16
if moe_config.solution_type == MoeSolutionType.MOE_C:
from .fused_moe_c import moe_c_fused_experts
......@@ -401,6 +446,7 @@ def aiter_moe(
use_int8_w8a8=use_int8_w8a8,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w4a8=use_int8_w4a8,
use_int8_w8a16=use_int8_w8a16,
activation=activation,
global_num_experts=global_num_experts,
expert_map=expert_map,
......@@ -411,12 +457,14 @@ def aiter_moe(
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_shape,
routed_scaling_factor=routed_scaling_factor
routed_scaling_factor=routed_scaling_factor,
gemm1_alpha=gemm1_alpha,
gemm1_limit=gemm1_limit
)
if moe_config.solution_type == MoeSolutionType.ASM:
from .fused_moe_asm_wna16 import fused_experts_asm_impl
per_channel_quant = True if block_shape is None else False
cfg = moe_config.config
solution_id = f"{cfg['SOL_ID1']}+{cfg['SOL_ID2']}"
return fused_experts_asm_impl(
......@@ -425,12 +473,13 @@ def aiter_moe(
w2,
topk_weights,
topk_ids,
dtype=hidden_states.dtype,
dtype=output_dtype,
inplace=inplace,
use_int4_w4a16=use_int4_w4a16,
use_int8_w8a8=use_int8_w8a8,
use_fp8_w8a8=use_fp8_w8a8,
activation=activation,
per_channel_quant = per_channel_quant,
global_num_experts=global_num_experts,
expert_map=expert_map,
w1_scale=w1_scale,
......@@ -440,15 +489,18 @@ def aiter_moe(
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_shape,
use_shuffle=use_weight_shuffle,
solution_id=solution_id,
routed_scaling_factor=routed_scaling_factor
routed_scaling_factor=routed_scaling_factor,
gemm1_alpha=gemm1_alpha,
gemm1_limit=gemm1_limit
)
if moe_config.solution_type == MoeSolutionType.TRITON:
from .ops.triton.fused_moe import fused_experts_impl
# W8A8 channel-wise (block_shape=None) requires per_channel_quant=True
per_channel_quant = (use_int8_w8a8 or use_fp8_w8a8) and block_shape is None
# W8A8 / W8A16 channel-wise (block_shape=None) requires per_channel_quant=True
per_channel_quant = (use_int8_w8a8 or use_fp8_w8a8 or use_int8_w8a16) and block_shape is None
return fused_experts_impl(
hidden_states,
......@@ -456,11 +508,12 @@ def aiter_moe(
w2,
topk_weights,
topk_ids,
odtype=hidden_states.dtype,
output_dtype=output_dtype,
inplace=inplace,
use_int4_w4a16=use_int4_w4a16,
use_int8_w8a8=use_int8_w8a8,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
activation=activation,
per_channel_quant=per_channel_quant,
global_num_experts=global_num_experts,
......@@ -472,7 +525,9 @@ def aiter_moe(
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_shape,
routed_scaling_factor=routed_scaling_factor
routed_scaling_factor=routed_scaling_factor,
gemm1_alpha=gemm1_alpha,
gemm1_limit=gemm1_limit
)
if moe_config.solution_type == MoeSolutionType.CK:
......@@ -485,7 +540,7 @@ def aiter_moe(
w2,
topk_weights,
topk_ids,
odtype=hidden_states.dtype,
odtype=output_dtype,
inplace=inplace,
use_int8_w8a8=use_int8_w8a8,
use_fp8_w8a8=use_fp8_w8a8,
......@@ -499,6 +554,7 @@ def aiter_moe(
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_shape,
use_shuffle=use_weight_shuffle,
routed_scaling_factor=routed_scaling_factor,
solution_id=solution_id,
)
......
{
"1": {
"BLOCK_SIZE_M": 16,
"MODE": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"32": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"64": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"128": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"256": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"512": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"1024": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"2048": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"4096": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"8192": {
"BLOCK_SIZE_M": 16,
"MODE": 5
}
}
\ No newline at end of file
{
"1": {
"BLOCK_SIZE_M": 16,
"MODE": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"32": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"64": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"128": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"256": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"512": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"1024": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"2048": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"4096": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"8192": {
"BLOCK_SIZE_M": 16,
"MODE": 5
}
}
\ No newline at end of file
{
"1": {
"BLOCK_SIZE_M": 16,
"MODE": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"32": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"64": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"128": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"256": {
"BLOCK_SIZE_M": 16,
"MODE": 1
},
"512": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"1024": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"2048": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"4096": {
"BLOCK_SIZE_M": 16,
"MODE": 5
},
"8192": {
"BLOCK_SIZE_M": 16,
"MODE": 5
}
}
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment