Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
47bd229c
Commit
47bd229c
authored
Feb 20, 2025
by
yangql
Browse files
适配deepseekv3\v2 moe awq的推理支持
parent
4a734b9d
Changes
31
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
1304 additions
and
22 deletions
+1304
-22
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2048_K100_AI.json
...ayers/quantization/configs/awq/AWQ_7168_2048_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_BW200.json
.../layers/quantization/configs/awq/AWQ_7168_2304_BW200.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_K100_AI.json
...ayers/quantization/configs/awq/AWQ_7168_2304_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_BW200.json
...r/layers/quantization/configs/awq/AWQ_7168_256_BW200.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_K100_AI.json
...layers/quantization/configs/awq/AWQ_7168_256_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/moe_wna16.py
vllm/model_executor/layers/quantization/moe_wna16.py
+7
-1
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/baichuan.py
+5
-3
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+59
-7
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+5
-4
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+5
-4
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+3
-3
No files found.
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2048_K100_AI.json
0 → 100644
View file @
47bd229c
{
"7168_2048"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_BW200.json
0 → 100644
View file @
47bd229c
This diff is collapsed.
Click to expand it.
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_K100_AI.json
0 → 100644
View file @
47bd229c
This diff is collapsed.
Click to expand it.
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_BW200.json
0 → 100644
View file @
47bd229c
This diff is collapsed.
Click to expand it.
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_K100_AI.json
0 → 100644
View file @
47bd229c
This diff is collapsed.
Click to expand it.
vllm/model_executor/layers/quantization/moe_wna16.py
View file @
47bd229c
...
...
@@ -277,6 +277,10 @@ class MoeWNA16Method(FusedMoEMethodBase):
custom_routing_function
:
Optional
[
Callable
]
=
None
,
scoring_func
:
str
=
"softmax"
,
e_score_correction_bias
:
Optional
[
torch
.
Tensor
]
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
moe_ep_size
:
Optional
[
int
]
=
None
,
start_expert
:
Optional
[
int
]
=
None
,
end_expert
:
Optional
[
int
]
=
None
,
)
->
torch
.
Tensor
:
from
vllm.model_executor.layers.fused_moe
import
fused_experts
...
...
@@ -307,7 +311,9 @@ class MoeWNA16Method(FusedMoEMethodBase):
w2_scale
=
layer
.
w2_scales
,
w1_zp
=
layer
.
w13_qzeros
if
has_zp
else
None
,
w2_zp
=
layer
.
w2_qzeros
if
has_zp
else
None
,
block_shape
=
[
0
,
layer
.
group_size
])
block_shape
=
[
0
,
layer
.
group_size
],
use_nn_moe
=
False
,
)
@
staticmethod
def
get_weight_loader
(
layer
,
weight_loader
):
...
...
vllm/model_executor/models/baichuan.py
View file @
47bd229c
...
...
@@ -27,7 +27,7 @@ from torch import nn
from
transformers
import
PretrainedConfig
import
os
import
re
import
vllm.envs
as
envs
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -517,9 +517,11 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
if
self
.
quant_method
==
"awq"
:
else
:
os
.
environ
[
'LM_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
if
self
.
quant_method
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
lay_key_words
=
[
"self_attn.W_pack.qweight"
,
"self_attn.o_proj.qweight"
,
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
47bd229c
This diff is collapsed.
Click to expand it.
vllm/model_executor/models/llama.py
View file @
47bd229c
...
...
@@ -29,7 +29,7 @@ from torch import nn
from
transformers
import
LlamaConfig
import
os
import
re
import
vllm.envs
as
envs
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -505,9 +505,11 @@ class LlamaModel(nn.Module):
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
if
self
.
quant_method
==
"awq"
:
else
:
os
.
environ
[
'LM_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
if
self
.
quant_method
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
lay_key_words
=
[
"self_attn.qkv_proj.qweight"
,
"self_attn.o_proj.qweight"
,
...
...
@@ -551,7 +553,6 @@ class LlamaModel(nn.Module):
#当为triton支持推理的时候不能进行处理
if
self
.
quant_method
==
"compressed_tensors"
:
os
.
environ
[
'LM_NN'
]
=
'0'
lay_key_words
=
[
"self_attn.qkv_proj.weight"
,
"self_attn.o_proj.weight"
,
...
...
vllm/model_executor/models/qwen2.py
View file @
47bd229c
...
...
@@ -30,7 +30,7 @@ from torch import nn
from
transformers
import
Qwen2Config
import
os
import
re
import
vllm.envs
as
envs
from
vllm.attention
import
Attention
,
AttentionMetadata
,
AttentionType
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
...
...
@@ -483,9 +483,11 @@ class Qwen2Model(nn.Module):
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
if
self
.
quant_method
==
"awq"
:
else
:
os
.
environ
[
'LM_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
if
self
.
quant_method
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
lay_key_words
=
[
"self_attn.qkv_proj.qweight"
,
"self_attn.o_proj.qweight"
,
...
...
@@ -528,7 +530,6 @@ class Qwen2Model(nn.Module):
qweight
.
data
=
torch
.
cat
((
qweight
.
data
,
qweight_pad
),
dim
=
1
).
contiguous
()
if
self
.
quant_method
==
"compressed_tensors"
:
os
.
environ
[
'LM_NN'
]
=
'0'
lay_key_words
=
[
"self_attn.qkv_proj.weight"
,
"self_attn.o_proj.weight"
,
...
...
vllm/platforms/rocm.py
View file @
47bd229c
...
...
@@ -72,7 +72,7 @@ class RocmPlatform(Platform):
supported_quantization
:
list
[
str
]
=
[
"awq"
,
"gptq"
,
"fp8"
,
"compressed_tensors"
,
"compressed-tensors"
,
"fbgemm_fp8"
,
"gguf"
,
"quark"
"fbgemm_fp8"
,
"gguf"
,
"quark"
,
"moe_wna16"
]
@
classmethod
...
...
@@ -157,8 +157,8 @@ class RocmPlatform(Platform):
if
quant
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
logger
.
warning
(
"Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
" is not set,
en
abling VLLM_USE_TRITON_AWQ."
)
envs
.
VLLM_USE_TRITON_AWQ
=
Tru
e
" is not set,
dis
abling VLLM_USE_TRITON_AWQ."
)
envs
.
VLLM_USE_TRITON_AWQ
=
Fals
e
@
classmethod
def
get_punica_wrapper
(
cls
)
->
str
:
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment