Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
706c031c
Commit
706c031c
authored
Mar 09, 2026
by
laibao
Browse files
moe: 补齐 fill+moe_align 融合开关语义
parent
5a14b60c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
21 additions
and
2 deletions
+21
-2
vllm/envs.py
vllm/envs.py
+5
-0
vllm/model_executor/layers/fused_moe/moe_align_block_size.py
vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+5
-2
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/utils.py
+11
-0
No files found.
vllm/envs.py
View file @
706c031c
...
...
@@ -306,6 +306,7 @@ if TYPE_CHECKING:
VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER
:
bool
=
False
VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE
:
bool
=
False
VLLM_USE_FUSED_DTBMM
:
bool
=
False
# DOUBLE TRANS BMM FP8
VLLM_USE_LIGHTOP_FILL_MOE_ALIGN
:
bool
=
False
VLLM_USE_CUDA_GRAPH_SIZES
:
bool
=
False
...
...
@@ -1910,6 +1911,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
).
lower
()
in
(
"true"
,
"1"
)
),
# If set to 1/True, vLLM fuses the sorted_ids fill into the lightop moe_align_block_size call instead of running a separate fill_.
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# If set to 1/True, enable the fused split-QKV + RMSNorm + RoPE + KV update, as in GLM-4.7 MoE attention.
"VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE"
:
...
...
vllm/model_executor/layers/fused_moe/moe_align_block_size.py
View file @
706c031c
...
...
@@ -92,6 +92,8 @@ def moe_align_block_size(
sorted_ids
=
torch
.
empty
(
(
max_num_tokens_padded
,),
dtype
=
torch
.
int32
,
device
=
topk_ids
.
device
)
if
not
envs
.
VLLM_USE_LIGHTOP_FILL_MOE_ALIGN
:
sorted_ids
.
fill_
(
topk_ids
.
numel
())
max_num_m_blocks
=
triton
.
cdiv
(
max_num_tokens_padded
,
block_size
)
if
expert_map
is
not
None
:
expert_ids
=
torch
.
zeros
(
...
...
@@ -102,6 +104,7 @@ def moe_align_block_size(
(
max_num_m_blocks
,),
dtype
=
torch
.
int32
,
device
=
topk_ids
.
device
)
num_tokens_post_pad
=
torch
.
empty
((
1
),
dtype
=
torch
.
int32
,
device
=
topk_ids
.
device
)
is_fuse_fill
=
envs
.
VLLM_USE_LIGHTOP_FILL_MOE_ALIGN
if
envs
.
VLLM_USE_LIGHTOP
or
expert_mask
is
not
None
:
from
lightop
import
op
as
op
...
...
@@ -115,7 +118,7 @@ def moe_align_block_size(
expert_map
=
expert_map
,
expert_mask
=
expert_mask
,
num_local_tokens
=
None
,
Is_fuse_fill
=
True
,
Is_fuse_fill
=
is_fuse_fill
,
)
else
:
if
envs
.
VLLM_USE_LIGHTOP_MOE_ALIGN
:
...
...
@@ -130,7 +133,7 @@ def moe_align_block_size(
expert_map
=
None
,
expert_mask
=
None
,
num_local_tokens
=
None
,
Is_fuse_fill
=
True
,
Is_fuse_fill
=
is_fuse_fill
,
)
else
:
ops
.
moe_align_block_size
(
...
...
vllm/model_executor/model_loader/utils.py
View file @
706c031c
...
...
@@ -194,6 +194,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
os
.
environ
[
'VLLM_USE_OPT_CAT'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_FUSED_FILL_RMS_CAT"
):
os
.
environ
[
'VLLM_USE_FUSED_FILL_RMS_CAT'
]
=
'1'
if
model_config
.
quantization
in
{
"slimquant_w4a8"
,
"slimquant_w4a8_marlin"
,
"slimquant_compressed_tensors_marlin"
,
"compressed-tensors"
}:
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'
]
=
'1'
# if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
# if not envs.is_set("USE_FUSED_RMS_QUANT"):
# os.environ['USE_FUSED_RMS_QUANT'] = '1'
...
...
@@ -205,6 +208,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
if
architectures
in
[[
'Qwen3MoeForCausalLM'
]]:
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_ALIGN"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_MOE_ALIGN'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_SUM"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_MOE_SUM'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_FUSE_SILU_AND_MUL"
):
...
...
@@ -231,6 +236,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
os
.
environ
[
'VLLM_USE_OPT_CAT'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_FUSED_FILL_RMS_CAT"
):
os
.
environ
[
'VLLM_USE_FUSED_FILL_RMS_CAT'
]
=
'1'
if
model_config
.
quantization
in
{
"slimquant_w4a8"
,
"slimquant_w4a8_marlin"
,
"slimquant_compressed_tensors_marlin"
,
"compressed-tensors"
}:
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'
]
=
'1'
# if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
# if not envs.is_set("USE_FUSED_RMS_QUANT"):
# os.environ['USE_FUSED_RMS_QUANT'] = '1'
...
...
@@ -242,6 +250,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
if
architectures
in
[[
'Qwen3MoeForCausalLM'
]]:
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_ALIGN"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_MOE_ALIGN'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_LIGHTOP_MOE_SUM"
):
os
.
environ
[
'VLLM_USE_LIGHTOP_MOE_SUM'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_FUSE_SILU_AND_MUL"
):
...
...
@@ -307,6 +317,7 @@ def get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
model_config
.
runner_type
,
model_config
.
trust_remote_code
,
model_config
.
model_impl
,
model_config
.
quantization
,
tuple
(
getattr
(
model_config
.
hf_config
,
"architectures"
,
[])),
)
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment