Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
475dcaa0
Commit
475dcaa0
authored
Jan 15, 2026
by
yangql
Browse files
修复deepseek moe模型的awq量化推理bug和精度问题
parent
efd51772
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
16 additions
and
3 deletions
+16
-3
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+4
-1
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/utils.py
+4
-0
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+8
-2
No files found.
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
View file @
475dcaa0
...
@@ -11,7 +11,10 @@ from vllm.config import CompilationLevel, get_current_vllm_config
...
@@ -11,7 +11,10 @@ from vllm.config import CompilationLevel, get_current_vllm_config
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
W8a8GetCacheJSON
from
vllm.utils
import
W8a8GetCacheJSON
from
lmslim.layers.gemm.int8_utils
import
per_token_quant_int8
from
lmslim.layers.gemm.int8_utils
import
per_token_quant_int8
from
lmslim.layers.gemm.fp8_utils
import
triton_scaled_mm_fp8
try
:
from
lmslim.layers.gemm.fp8_utils
import
triton_scaled_mm_fp8
except
Exception
:
print("INFO: Please updata lmslim if you want to use fp8_utils.\n")
# Input scaling factors are no longer optional in _scaled_mm starting
# Input scaling factors are no longer optional in _scaled_mm starting
# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
TORCH_DEVICE_IDENTITY
=
None
TORCH_DEVICE_IDENTITY
=
None
...
...
vllm/model_executor/model_loader/utils.py
View file @
475dcaa0
...
@@ -288,6 +288,10 @@ def get_model_architecture(
...
@@ -288,6 +288,10 @@ def get_model_architecture(
os
.
environ
[
'FA_PAD'
]
=
'0'
os
.
environ
[
'FA_PAD'
]
=
'0'
else
:
else
:
if
architectures
in
[[
'DeepseekV3ForCausalLM'
],
[
'DeepSeekMTPModel'
]]:
if
architectures
in
[[
'DeepseekV3ForCausalLM'
],
[
'DeepSeekMTPModel'
]]:
#针对使用dtype为fp16的情况的量化默认关闭"VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD"
if
model_config
.
quantization
in
{
"awq"
,
"awq_marlin"
,
"moe_wna16"
}:
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD'
]
=
'0'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP"
):
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP"
):
os
.
environ
[
'VLLM_USE_LIGHTOP'
]
=
'1'
os
.
environ
[
'VLLM_USE_LIGHTOP'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD"
):
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD"
):
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
475dcaa0
...
@@ -385,9 +385,12 @@ class DeepseekV2MoE(nn.Module):
...
@@ -385,9 +385,12 @@ class DeepseekV2MoE(nn.Module):
# Fix FP16 overflow
# Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details.
# See DeepseekV2DecoderLayer for more details.
# fp16 mode not fused quant
# fp16 mode not fused quant
if
i_q
is
not
None
:
i_q
=
iqis
[
0
]
i_s
=
iqis
[
1
]
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
,
router_logits
=
router_logits
,
i_q=iqis[0],
i_s=iqis[1])
i_q=i_q,
i_s=i_s)
if
shared_output
is
not
None
:
if
shared_output
is
not
None
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
...
@@ -429,9 +432,12 @@ class DeepseekV2MoE(nn.Module):
...
@@ -429,9 +432,12 @@ class DeepseekV2MoE(nn.Module):
assert
shared_output
is
not
None
assert
shared_output
is
not
None
final_hidden_states
+=
(
shared_output
*
(
1.
/
self
.
routed_scaling_factor
))
final_hidden_states
+=
(
shared_output
*
(
1.
/
self
.
routed_scaling_factor
))
else
:
else
:
if
i_q
is
not
None
:
i_q
=
iqis
[
0
]
i_s
=
iqis
[
1
]
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
,
router_logits
=
router_logits
,
i_q=iqis[0],
i_s=iqis[1])
i_q=i_q,
i_s=i_s)
if
shared_output
is
not
None
:
if
shared_output
is
not
None
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment