Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
47bd229c
Commit
47bd229c
authored
Feb 20, 2025
by
yangql
Browse files
适配deepseekv3\v2 moe awq的推理支持
parent
4a734b9d
Changes
31
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
1304 additions
and
22 deletions
+1304
-22
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2048_K100_AI.json
...ayers/quantization/configs/awq/AWQ_7168_2048_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_BW200.json
.../layers/quantization/configs/awq/AWQ_7168_2304_BW200.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_K100_AI.json
...ayers/quantization/configs/awq/AWQ_7168_2304_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_BW200.json
...r/layers/quantization/configs/awq/AWQ_7168_256_BW200.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_K100_AI.json
...layers/quantization/configs/awq/AWQ_7168_256_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/moe_wna16.py
vllm/model_executor/layers/quantization/moe_wna16.py
+7
-1
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/baichuan.py
+5
-3
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+59
-7
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+5
-4
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+5
-4
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+3
-3
No files found.
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2048_K100_AI.json
0 → 100644
View file @
47bd229c
{
"7168_2048"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_BW200.json
0 → 100644
View file @
47bd229c
{
"7168_2304"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
2
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_K100_AI.json
0 → 100644
View file @
47bd229c
{
"7168_2304"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
0
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
2
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_BW200.json
0 → 100644
View file @
47bd229c
{
"7168_256"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_K100_AI.json
0 → 100644
View file @
47bd229c
{
"7168_256"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"14"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/moe_wna16.py
View file @
47bd229c
...
...
@@ -277,6 +277,10 @@ class MoeWNA16Method(FusedMoEMethodBase):
custom_routing_function
:
Optional
[
Callable
]
=
None
,
scoring_func
:
str
=
"softmax"
,
e_score_correction_bias
:
Optional
[
torch
.
Tensor
]
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
moe_ep_size
:
Optional
[
int
]
=
None
,
start_expert
:
Optional
[
int
]
=
None
,
end_expert
:
Optional
[
int
]
=
None
,
)
->
torch
.
Tensor
:
from
vllm.model_executor.layers.fused_moe
import
fused_experts
...
...
@@ -307,7 +311,9 @@ class MoeWNA16Method(FusedMoEMethodBase):
w2_scale
=
layer
.
w2_scales
,
w1_zp
=
layer
.
w13_qzeros
if
has_zp
else
None
,
w2_zp
=
layer
.
w2_qzeros
if
has_zp
else
None
,
block_shape
=
[
0
,
layer
.
group_size
])
block_shape
=
[
0
,
layer
.
group_size
],
use_nn_moe
=
False
,
)
@
staticmethod
def
get_weight_loader
(
layer
,
weight_loader
):
...
...
vllm/model_executor/models/baichuan.py
View file @
47bd229c
...
...
@@ -27,7 +27,7 @@ from torch import nn
from
transformers
import
PretrainedConfig
import
os
import
re
import
vllm.envs
as
envs
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -517,9 +517,11 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
if
self
.
quant_method
==
"awq"
:
else
:
os
.
environ
[
'LM_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
if
self
.
quant_method
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
lay_key_words
=
[
"self_attn.W_pack.qweight"
,
"self_attn.o_proj.qweight"
,
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
47bd229c
...
...
@@ -25,7 +25,7 @@
import
os
import
re
from
typing
import
Any
,
Dict
,
Iterable
,
List
,
Optional
,
Set
,
Tuple
,
Union
import
vllm.envs
as
envs
import
torch
from
torch
import
nn
from
transformers
import
PretrainedConfig
...
...
@@ -666,8 +666,17 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
# 暂时awq不支持cutlass
envs
.
VLLM_USE_TRITON_AWQ
=
True
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
self
.
quant_method
=
None
if
quant_config
is
not
None
:
self
.
quant_method
=
quant_config
.
get_name
()
os
.
environ
[
'LLAMA_NN'
]
=
'0'
os
.
environ
[
'LM_NN'
]
=
'0'
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
parallel_config
=
vllm_config
.
parallel_config
...
...
@@ -683,13 +692,9 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
self
.
quant_method
=
None
if
quant_config
is
not
None
:
self
.
quant_method
=
quant_config
.
get_name
()
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
model
.
get_input_embeddings
(
input_ids
)
...
...
@@ -870,6 +875,53 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
# 暂时不支持TN
if
self
.
quant_method
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
lay_key_words
=
[
"self_attn.q_a_proj.qweight"
,
"self_attn.q_b_proj.qweight"
,
"self_attn.kv_a_proj_with_mqa.qweight"
,
"self_attn.kv_b_proj.qweight"
,
"self_attn.o_proj.qweight"
,
"mlp.gate_up_proj.qweight"
,
"mlp.down_proj.qweight"
,
"mlp.shared_experts.gate_up_proj.qweight"
,
"mlp.shared_experts.down_proj.qweight"
]
combined_words
=
"|"
.
join
(
lay_key_words
)
for
layername
in
loaded_params
:
weight
=
params_dict
[
layername
]
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
qweight
=
params_dict
[
layername
]
qzeros
=
params_dict
[
layername
.
replace
(
"qweight"
,
"qzeros"
)]
scales
=
params_dict
[
layername
.
replace
(
"qweight"
,
"scales"
)]
zeros_and_scalse
=
params_dict
[
layername
.
replace
(
"qweight"
,
"zeros_and_scales"
)]
group_size
=
self
.
quant_config
.
group_size
dim_n
=
scales
.
data
.
shape
[
1
]
dim_k
=
qweight
.
data
.
shape
[
0
]
pad_group
=
2
_qw
,
_sz
=
ops
.
convert_s4
(
qweight
,
qzeros
,
scales
,
int
(
group_size
))
sz
=
ops
.
sz_permute
(
_sz
).
reshape
(
-
1
,
dim_n
)
zeros_and_scalse
.
data
.
copy_
(
sz
)
qweight
.
data
.
copy_
(
_qw
)
#reshape
zeros_and_scalse
.
data
=
zeros_and_scalse
.
reshape
(
dim_n
,
-
1
)
#[k/greop_size,n]------>[n,k/group_size]
qweight
.
data
=
qweight
.
data
.
reshape
(
dim_n
,
-
1
)
#[k,n/8]---->[n,k/8]
if
dim_k
%
4096
==
0
and
self
.
use_awq_pad
:
zeros_and_scalse_pad
=
torch
.
zeros
(
dim_n
,
pad_group
,
dtype
=
torch
.
int32
).
cuda
()
zeros_and_scalse
.
data
=
torch
.
cat
((
zeros_and_scalse
.
data
,
zeros_and_scalse_pad
),
dim
=
1
).
contiguous
()
qweight_pad
=
torch
.
zeros
(
dim_n
,
int
(
group_size
//
4
),
dtype
=
torch
.
int32
).
cuda
()
qweight
.
data
=
torch
.
cat
((
qweight
.
data
,
qweight_pad
),
dim
=
1
).
contiguous
()
return
loaded_params
...
...
vllm/model_executor/models/llama.py
View file @
47bd229c
...
...
@@ -29,7 +29,7 @@ from torch import nn
from
transformers
import
LlamaConfig
import
os
import
re
import
vllm.envs
as
envs
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -505,9 +505,11 @@ class LlamaModel(nn.Module):
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
if
self
.
quant_method
==
"awq"
:
else
:
os
.
environ
[
'LM_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
if
self
.
quant_method
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
lay_key_words
=
[
"self_attn.qkv_proj.qweight"
,
"self_attn.o_proj.qweight"
,
...
...
@@ -551,7 +553,6 @@ class LlamaModel(nn.Module):
#当为triton支持推理的时候不能进行处理
if
self
.
quant_method
==
"compressed_tensors"
:
os
.
environ
[
'LM_NN'
]
=
'0'
lay_key_words
=
[
"self_attn.qkv_proj.weight"
,
"self_attn.o_proj.weight"
,
...
...
vllm/model_executor/models/qwen2.py
View file @
47bd229c
...
...
@@ -30,7 +30,7 @@ from torch import nn
from
transformers
import
Qwen2Config
import
os
import
re
import
vllm.envs
as
envs
from
vllm.attention
import
Attention
,
AttentionMetadata
,
AttentionType
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -483,9 +483,11 @@ class Qwen2Model(nn.Module):
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
if
self
.
quant_method
==
"awq"
:
else
:
os
.
environ
[
'LM_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
if
self
.
quant_method
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
lay_key_words
=
[
"self_attn.qkv_proj.qweight"
,
"self_attn.o_proj.qweight"
,
...
...
@@ -528,7 +530,6 @@ class Qwen2Model(nn.Module):
qweight
.
data
=
torch
.
cat
((
qweight
.
data
,
qweight_pad
),
dim
=
1
).
contiguous
()
if
self
.
quant_method
==
"compressed_tensors"
:
os
.
environ
[
'LM_NN'
]
=
'0'
lay_key_words
=
[
"self_attn.qkv_proj.weight"
,
"self_attn.o_proj.weight"
,
...
...
vllm/platforms/rocm.py
View file @
47bd229c
...
...
@@ -72,7 +72,7 @@ class RocmPlatform(Platform):
supported_quantization
:
list
[
str
]
=
[
"awq"
,
"gptq"
,
"fp8"
,
"compressed_tensors"
,
"compressed-tensors"
,
"fbgemm_fp8"
,
"gguf"
,
"quark"
"fbgemm_fp8"
,
"gguf"
,
"quark"
,
"moe_wna16"
]
@
classmethod
...
...
@@ -157,8 +157,8 @@ class RocmPlatform(Platform):
if
quant
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
logger
.
warning
(
"Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
" is not set,
en
abling VLLM_USE_TRITON_AWQ."
)
envs
.
VLLM_USE_TRITON_AWQ
=
Tru
e
" is not set,
dis
abling VLLM_USE_TRITON_AWQ."
)
envs
.
VLLM_USE_TRITON_AWQ
=
Fals
e
@
classmethod
def
get_punica_wrapper
(
cls
)
->
str
:
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment