Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
03cdc9be
Commit
03cdc9be
authored
Nov 27, 2025
by
zhuwenwen
Browse files
Merge branch 'v0.9.2-dev' of
http://10.16.6.30/dcutoolkit/deeplearing/vllm
into v0.9.2-dev
parents
4c167900
a6bed85b
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
66 additions
and
35 deletions
+66
-35
vllm/envs.py
vllm/envs.py
+4
-4
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+61
-31
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+1
-0
No files found.
vllm/envs.py
View file @
03cdc9be
...
...
@@ -172,8 +172,8 @@ if TYPE_CHECKING:
VLLM_USE_LIGHTOP_MOE_SUM
:
bool
=
False
VLLM_USE_LIGHTOP_MOE_ALIGN
:
bool
=
False
VLLM_USE_MERGE_ATTN_STATES_OPT
:
bool
=
False
USE_FUSED_RMS_QUANT
:
bool
=
True
USE_FUSED_SILU_MUL_QUANT
:
bool
=
True
USE_FUSED_RMS_QUANT
:
bool
=
False
USE_FUSED_SILU_MUL_QUANT
:
bool
=
False
VLLM_P2P_ASYNC
:
bool
=
False
VLLM_P2P_BUF_TOKENS
:
int
=
30000
VLLM_SCHED_ENABLE_MINIMAL_INJECTION
:
bool
=
False
...
...
@@ -1158,12 +1158,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vllm will use rmsquant fused op
"USE_FUSED_RMS_QUANT"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"USE_FUSED_RMS_QUANT"
,
"
1
"
))),
lambda
:
bool
(
int
(
os
.
getenv
(
"USE_FUSED_RMS_QUANT"
,
"
0
"
))),
# vllm will use silu_mul_quant fused op,
# This variable has a default value of true,
# but it is still controlled by CRQ and RQ.
"USE_FUSED_SILU_MUL_QUANT"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"USE_FUSED_SILU_MUL_QUANT"
,
"
1
"
))),
lambda
:
bool
(
int
(
os
.
getenv
(
"USE_FUSED_SILU_MUL_QUANT"
,
"
0
"
))),
# vllm pd separation will be used async
"VLLM_P2P_ASYNC"
:
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
03cdc9be
...
...
@@ -377,7 +377,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
shared_output
:
Optional
[
torch
.
Tensor
]
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
routed_scaling_factor
:
Optional
[
float
]
=
1.0
,
use_fused_gate
:
Optional
[
bool
]
=
False
,
use_fused_gate
:
Optional
[
bool
]
=
False
,
**
_
)
->
torch
.
Tensor
:
if
enable_eplb
:
raise
NotImplementedError
(
...
...
@@ -1542,6 +1542,7 @@ class FusedMoE(torch.nn.Module):
hidden_states
,
router_logits
=
get_ep_group
().
dispatch
(
hidden_states
,
router_logits
)
if
envs
.
USE_FUSED_RMS_QUANT
:
# Matrix multiply.
final_hidden_states
=
self
.
quant_method
.
apply
(
layer
=
self
,
...
...
@@ -1570,6 +1571,32 @@ class FusedMoE(torch.nn.Module):
i_q
=
i_q
,
i_s
=
i_s
)
else
:
final_hidden_states
=
self
.
quant_method
.
apply
(
layer
=
self
,
x
=
hidden_states
,
router_logits
=
router_logits
,
top_k
=
self
.
top_k
,
renormalize
=
self
.
renormalize
,
use_grouped_topk
=
self
.
use_grouped_topk
,
global_num_experts
=
self
.
global_num_experts
,
expert_map
=
self
.
expert_map
,
topk_group
=
self
.
topk_group
,
num_expert_group
=
self
.
num_expert_group
,
custom_routing_function
=
self
.
custom_routing_function
,
scoring_func
=
self
.
scoring_func
,
e_score_correction_bias
=
self
.
e_score_correction_bias
,
activation
=
self
.
activation
,
apply_router_weight_on_input
=
self
.
apply_router_weight_on_input
,
enable_eplb
=
self
.
enable_eplb
,
expert_load_view
=
self
.
expert_load_view
,
logical_to_physical_map
=
self
.
logical_to_physical_map
,
logical_replica_count
=
self
.
logical_replica_count
,
shared_output
=
shared_output
,
use_nn_moe
=
self
.
use_nn_moe
,
routed_scaling_factor
=
self
.
routed_scaling_factor
,
use_fused_gate
=
self
.
use_fused_gate
,
)
if
do_naive_dispatch_combine
:
final_hidden_states
=
get_ep_group
().
combine
(
final_hidden_states
)
...
...
@@ -1645,8 +1672,11 @@ def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
forward_context
:
ForwardContext
=
get_forward_context
()
self
=
forward_context
.
no_compile_layers
[
layer_name
]
assert
self
.
quant_method
is
not
None
if
envs
.
USE_FUSED_RMS_QUANT
:
return
self
.
forward_impl
(
hidden_states
,
router_logits
,
shared_output
,
i_q
,
i_s
)
else
:
return
self
.
forward_impl
(
hidden_states
,
router_logits
,
shared_output
)
def
moe_forward_fake
(
hidden_states
:
torch
.
Tensor
,
router_logits
:
torch
.
Tensor
,
...
...
vllm/v1/attention/backends/flash_attn.py
View file @
03cdc9be
...
...
@@ -576,6 +576,7 @@ class FlashAttentionImpl(AttentionImpl):
layer
.
_k_scale
,
layer
.
_v_scale
)
else
:
from
vllm.attention.utils.fa_utils
import
reshape_and_cache_cuda
reshape_and_cache_cuda
(
key
,
value
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment