Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7f54652e
Commit
7f54652e
authored
Oct 24, 2025
by
zhuwenwen
Browse files
Merge branch 'v0.9.2-dev' into v0.9.2-step3v
parents
10a836fb
46dd30e7
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
20 additions
and
14 deletions
+20
-14
vllm/envs.py
vllm/envs.py
+5
-0
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+1
-1
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/utils.py
+5
-4
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+9
-9
No files found.
vllm/envs.py
View file @
7f54652e
...
...
@@ -167,6 +167,7 @@ if TYPE_CHECKING:
VLLM_USE_LIGHTOP
:
bool
=
False
VLLM_USE_OPT_CAT
:
bool
=
False
VLLM_USE_OPT_MOE_SUM
:
bool
=
False
VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD
:
bool
=
False
VLLM_USE_LIGHTOP_MOE_SUM
:
bool
=
False
VLLM_USE_LIGHTOP_MOE_ALIGN
:
bool
=
False
VLLM_USE_MERGE_ATTN_STATES_OPT
:
bool
=
False
...
...
@@ -1111,6 +1112,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_OPT_MOE_SUM"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_OPT_MOE_SUM"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# vLLM will use lightop moe_sum_mul_add
"VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
# vLLM will use lightop moe_sum
"VLLM_USE_LIGHTOP_MOE_SUM"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_MOE_SUM"
,
"True"
).
lower
()
in
...
...
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
7f54652e
...
...
@@ -1895,7 +1895,7 @@ def fused_experts_impl(
block_shape
=
block_shape
,
use_nn_moe
=
use_nn_moe
)
if
envs
.
VLLM_USE_LIGHTOP
:
if
envs
.
VLLM_USE_LIGHTOP
_MOE_SUM_MUL_ADD
:
from
lightop
import
op
as
op
op
.
moe_sum
(
input
=
intermediate_cache3
.
view
(
*
intermediate_cache3
.
size
()),
output
=
out_hidden_states
[
begin_chunk_idx
:
end_chunk_idx
],
bias
=
shared_output
[
begin_chunk_idx
:
end_chunk_idx
],
...
...
vllm/model_executor/model_loader/utils.py
View file @
7f54652e
...
...
@@ -247,6 +247,8 @@ def get_model_architecture(
if
architectures
in
[[
'DeepseekV3ForCausalLM'
],
[
'DeepSeekMTPModel'
]]:
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP"
):
os
.
environ
[
'VLLM_USE_LIGHTOP'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_OPT_CAT"
):
os
.
environ
[
'VLLM_USE_OPT_CAT'
]
=
'1'
...
...
@@ -258,12 +260,11 @@ def get_model_architecture(
if
architectures
in
[[
'DeepseekV3ForCausalLM'
],
[
'DeepSeekMTPModel'
]]:
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP"
):
os
.
environ
[
'VLLM_USE_LIGHTOP'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_OPT_CAT"
):
os
.
environ
[
'VLLM_USE_OPT_CAT'
]
=
'1'
if
not
envs
.
is_set
(
"USE_FUSED_RMS_QUANT"
):
os
.
environ
[
'USE_FUSED_RMS_QUANT'
]
=
'1'
if
not
envs
.
is_set
(
"USE_FUSED_SILU_MUL_QUANT"
):
os
.
environ
[
'USE_FUSED_SILU_MUL_QUANT'
]
=
'1'
# awq相关配置
try
:
if
os
.
getenv
(
'AWQ_MOE_SZ'
)
==
None
:
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
7f54652e
...
...
@@ -214,7 +214,7 @@ class DeepseekV2MoE(nn.Module):
# router_logits: (num_tokens, n_experts)
router_logits
,
_
=
self
.
gate
(
hidden_states
)
if
envs
.
VLLM_USE_LIGHTOP
:
if
envs
.
VLLM_USE_LIGHTOP
_MOE_SUM_MUL_ADD
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
,
...
...
@@ -230,14 +230,14 @@ class DeepseekV2MoE(nn.Module):
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
if
shared_output
is
not
None
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
final_hidden_states
=
final_hidden_states
+
shared_output
else
:
# Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
if
shared_output
is
not
None
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
final_hidden_states
=
final_hidden_states
+
shared_output
else
:
# Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
if
self
.
tp_size
>
1
:
if
envs
.
VLLM_ENABLE_TBO
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment