"vllm/vscode:/vscode.git/clone" did not exist on "19fe1a0510852b5a892932653a5881e58e359660"
Commit 47bd229c authored by yangql's avatar yangql
Browse files

适配deepseekv3\v2 moe awq的推理支持

parent 4a734b9d
{
"7168_2048": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"7168_2304": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"7168_2304": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 2,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"7168_256": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"7168_256": {
"1": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"14": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
...@@ -277,6 +277,10 @@ class MoeWNA16Method(FusedMoEMethodBase): ...@@ -277,6 +277,10 @@ class MoeWNA16Method(FusedMoEMethodBase):
custom_routing_function: Optional[Callable] = None, custom_routing_function: Optional[Callable] = None,
scoring_func: str = "softmax", scoring_func: str = "softmax",
e_score_correction_bias: Optional[torch.Tensor] = None, e_score_correction_bias: Optional[torch.Tensor] = None,
use_nn_moe: Optional[bool] = False,
moe_ep_size: Optional[int] = None,
start_expert: Optional[int] = None,
end_expert: Optional[int] = None,
) -> torch.Tensor: ) -> torch.Tensor:
from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe import fused_experts
...@@ -307,7 +311,9 @@ class MoeWNA16Method(FusedMoEMethodBase): ...@@ -307,7 +311,9 @@ class MoeWNA16Method(FusedMoEMethodBase):
w2_scale=layer.w2_scales, w2_scale=layer.w2_scales,
w1_zp=layer.w13_qzeros if has_zp else None, w1_zp=layer.w13_qzeros if has_zp else None,
w2_zp=layer.w2_qzeros if has_zp else None, w2_zp=layer.w2_qzeros if has_zp else None,
block_shape=[0, layer.group_size]) block_shape=[0, layer.group_size],
use_nn_moe=False,
)
@staticmethod @staticmethod
def get_weight_loader(layer, weight_loader): def get_weight_loader(layer, weight_loader):
......
...@@ -27,7 +27,7 @@ from torch import nn ...@@ -27,7 +27,7 @@ from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
import os import os
import re import re
import vllm.envs as envs
from vllm.attention import Attention, AttentionMetadata from vllm.attention import Attention, AttentionMetadata
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
...@@ -517,9 +517,11 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -517,9 +517,11 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
weight.data.copy_(_weight) weight.data.copy_(_weight)
weight.data=weight.data.reshape(ori_shape[1], -1) weight.data=weight.data.reshape(ori_shape[1], -1)
else:
if self.quant_method == "awq":
os.environ['LM_NN'] = '0' os.environ['LM_NN'] = '0'
os.environ['LLAMA_NN'] = '0'
if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
lay_key_words = [ lay_key_words = [
"self_attn.W_pack.qweight", "self_attn.W_pack.qweight",
"self_attn.o_proj.qweight", "self_attn.o_proj.qweight",
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
import os import os
import re import re
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
import vllm.envs as envs
import torch import torch
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
...@@ -666,8 +666,17 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): ...@@ -666,8 +666,17 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
# 暂时awq不支持cutlass
envs.VLLM_USE_TRITON_AWQ = True
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
self.quant_method = None
if quant_config is not None:
self.quant_method = quant_config.get_name()
os.environ['LLAMA_NN'] = '0'
os.environ['LM_NN'] = '0'
self.config = config self.config = config
self.quant_config = quant_config self.quant_config = quant_config
self.parallel_config = vllm_config.parallel_config self.parallel_config = vllm_config.parallel_config
...@@ -683,13 +692,9 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): ...@@ -683,13 +692,9 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
self.sampler = get_sampler() self.sampler = get_sampler()
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors) self.model.make_empty_intermediate_tensors)
self.quant_method = None
if quant_config is not None:
self.quant_method=quant_config.get_name()
self.use_llama_nn = os.environ.get('LLAMA_NN') == '1' self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.model.get_input_embeddings(input_ids) return self.model.get_input_embeddings(input_ids)
...@@ -870,6 +875,53 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): ...@@ -870,6 +875,53 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
weight.data.copy_(_weight) weight.data.copy_(_weight)
weight.data=weight.data.reshape(ori_shape[1],-1) weight.data=weight.data.reshape(ori_shape[1],-1)
# 暂时不支持TN
if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
lay_key_words = [
"self_attn.q_a_proj.qweight",
"self_attn.q_b_proj.qweight",
"self_attn.kv_a_proj_with_mqa.qweight",
"self_attn.kv_b_proj.qweight",
"self_attn.o_proj.qweight",
"mlp.gate_up_proj.qweight",
"mlp.down_proj.qweight",
"mlp.shared_experts.gate_up_proj.qweight",
"mlp.shared_experts.down_proj.qweight"
]
combined_words = "|".join(lay_key_words)
for layername in loaded_params:
weight = params_dict[layername]
matches = re.findall(combined_words, layername)
if matches:
qweight =params_dict[layername]
qzeros=params_dict[layername.replace("qweight", "qzeros")]
scales=params_dict[layername.replace("qweight", "scales")]
zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
group_size= self.quant_config.group_size
dim_n = scales.data.shape[1]
dim_k = qweight.data.shape[0]
pad_group=2
_qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size))
sz = ops.sz_permute(_sz).reshape(-1,dim_n)
zeros_and_scalse.data.copy_(sz)
qweight.data.copy_(_qw)
#reshape
zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size]
qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8]
if dim_k % 4096==0 and self.use_awq_pad:
zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
return loaded_params return loaded_params
......
...@@ -29,7 +29,7 @@ from torch import nn ...@@ -29,7 +29,7 @@ from torch import nn
from transformers import LlamaConfig from transformers import LlamaConfig
import os import os
import re import re
import vllm.envs as envs
from vllm.attention import Attention, AttentionMetadata from vllm.attention import Attention, AttentionMetadata
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
...@@ -505,9 +505,11 @@ class LlamaModel(nn.Module): ...@@ -505,9 +505,11 @@ class LlamaModel(nn.Module):
weight.data.copy_(_weight) weight.data.copy_(_weight)
weight.data=weight.data.reshape(ori_shape[1], -1) weight.data=weight.data.reshape(ori_shape[1], -1)
else:
if self.quant_method == "awq":
os.environ['LM_NN'] = '0' os.environ['LM_NN'] = '0'
os.environ['LLAMA_NN'] = '0'
if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
lay_key_words = [ lay_key_words = [
"self_attn.qkv_proj.qweight", "self_attn.qkv_proj.qweight",
"self_attn.o_proj.qweight", "self_attn.o_proj.qweight",
...@@ -551,7 +553,6 @@ class LlamaModel(nn.Module): ...@@ -551,7 +553,6 @@ class LlamaModel(nn.Module):
#当为triton支持推理的时候不能进行处理 #当为triton支持推理的时候不能进行处理
if self.quant_method == "compressed_tensors": if self.quant_method == "compressed_tensors":
os.environ['LM_NN'] = '0'
lay_key_words = [ lay_key_words = [
"self_attn.qkv_proj.weight", "self_attn.qkv_proj.weight",
"self_attn.o_proj.weight", "self_attn.o_proj.weight",
......
...@@ -30,7 +30,7 @@ from torch import nn ...@@ -30,7 +30,7 @@ from torch import nn
from transformers import Qwen2Config from transformers import Qwen2Config
import os import os
import re import re
import vllm.envs as envs
from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.attention import Attention, AttentionMetadata, AttentionType
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
...@@ -483,9 +483,11 @@ class Qwen2Model(nn.Module): ...@@ -483,9 +483,11 @@ class Qwen2Model(nn.Module):
weight.data.copy_(_weight) weight.data.copy_(_weight)
weight.data=weight.data.reshape(ori_shape[1],-1) weight.data=weight.data.reshape(ori_shape[1],-1)
else:
if self.quant_method == "awq":
os.environ['LM_NN'] = '0' os.environ['LM_NN'] = '0'
os.environ['LLAMA_NN'] = '0'
if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
lay_key_words = [ lay_key_words = [
"self_attn.qkv_proj.qweight", "self_attn.qkv_proj.qweight",
"self_attn.o_proj.qweight", "self_attn.o_proj.qweight",
...@@ -528,7 +530,6 @@ class Qwen2Model(nn.Module): ...@@ -528,7 +530,6 @@ class Qwen2Model(nn.Module):
qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous() qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
if self.quant_method == "compressed_tensors": if self.quant_method == "compressed_tensors":
os.environ['LM_NN'] = '0'
lay_key_words = [ lay_key_words = [
"self_attn.qkv_proj.weight", "self_attn.qkv_proj.weight",
"self_attn.o_proj.weight", "self_attn.o_proj.weight",
......
...@@ -72,7 +72,7 @@ class RocmPlatform(Platform): ...@@ -72,7 +72,7 @@ class RocmPlatform(Platform):
supported_quantization: list[str] = [ supported_quantization: list[str] = [
"awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
"fbgemm_fp8", "gguf", "quark" "fbgemm_fp8", "gguf", "quark", "moe_wna16"
] ]
@classmethod @classmethod
...@@ -157,8 +157,8 @@ class RocmPlatform(Platform): ...@@ -157,8 +157,8 @@ class RocmPlatform(Platform):
if quant == "awq" and not envs.VLLM_USE_TRITON_AWQ: if quant == "awq" and not envs.VLLM_USE_TRITON_AWQ:
logger.warning( logger.warning(
"Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
" is not set, enabling VLLM_USE_TRITON_AWQ.") " is not set, disabling VLLM_USE_TRITON_AWQ.")
envs.VLLM_USE_TRITON_AWQ = True envs.VLLM_USE_TRITON_AWQ = False
@classmethod @classmethod
def get_punica_wrapper(cls) -> str: def get_punica_wrapper(cls) -> str:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment