Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c50f084a
Commit
c50f084a
authored
Nov 21, 2025
by
jujl1
Browse files
feat: pp mtp加入零消耗调度,加入环境变量VLLM_USE_ZERO_MTP,默认打开
parent
d126ce21
Changes
2
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
659 additions
and
9 deletions
+659
-9
vllm/envs.py
vllm/envs.py
+13
-8
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+646
-1
No files found.
vllm/envs.py
View file @
c50f084a
...
@@ -182,6 +182,7 @@ if TYPE_CHECKING:
...
@@ -182,6 +182,7 @@ if TYPE_CHECKING:
VLLM_USE_LIGHTOP_FILL_MOE_ALIGN
:
bool
=
False
VLLM_USE_LIGHTOP_FILL_MOE_ALIGN
:
bool
=
False
USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT
:
bool
=
False
USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT
:
bool
=
False
VLLM_USE_PP_BALANCE
:
bool
=
False
VLLM_USE_PP_BALANCE
:
bool
=
False
VLLM_USE_ZERO_MTP
:
bool
=
False
VLLM_USE_CUDA_GRAPH_SIZES
:
bool
=
False
VLLM_USE_CUDA_GRAPH_SIZES
:
bool
=
False
def
get_default_cache_root
():
def
get_default_cache_root
():
...
@@ -1152,40 +1153,44 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1152,40 +1153,44 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vllm pd separation will be used async
# vllm pd separation will be used async
"VLLM_P2P_ASYNC"
:
"VLLM_P2P_ASYNC"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_P2P_ASYNC"
,
"0"
))),
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_P2P_ASYNC"
,
"0"
))),
# pd separation p2p async buf tokens
# pd separation p2p async buf tokens
"VLLM_P2P_BUF_TOKENS"
:
"VLLM_P2P_BUF_TOKENS"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_P2P_BUF_TOKENS"
,
"30000"
)),
lambda
:
int
(
os
.
getenv
(
"VLLM_P2P_BUF_TOKENS"
,
"30000"
)),
# vllm will enable minimal injection for pipeline parallel scheduling
# vllm will enable minimal injection for pipeline parallel scheduling
"VLLM_SCHED_ENABLE_MINIMAL_INJECTION"
:
"VLLM_SCHED_ENABLE_MINIMAL_INJECTION"
:
lambda
:
(
os
.
getenv
(
"VLLM_SCHED_ENABLE_MINIMAL_INJECTION"
,
"0"
).
lower
()
in
lambda
:
(
os
.
getenv
(
"VLLM_SCHED_ENABLE_MINIMAL_INJECTION"
,
"0"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vLLM will split prefill and decode, not mix up
# vLLM will split prefill and decode, not mix up
"VLLM_USE_PD_SPLIT"
:
"VLLM_USE_PD_SPLIT"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PD_SPLIT"
,
"False"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PD_SPLIT"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vLLM will sync to avoid pp vmfault
# vLLM will sync to avoid pp vmfault
"VLLM_USE_PP_SYNC"
:
"VLLM_USE_PP_SYNC"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PP_SYNC"
,
"False"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PP_SYNC"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vLLM will use lightop to fuse fill and moe align
# vLLM will use lightop to fuse fill and moe align
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
:
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
,
"False"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vllm will use custom-allreduce rmsquant fused op
# vllm will use custom-allreduce rmsquant fused op
"USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT"
:
"USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT"
:
lambda
:
(
os
.
getenv
(
'USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT'
,
'0'
).
lower
()
in
lambda
:
(
os
.
getenv
(
'USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT'
,
'0'
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
"VLLM_USE_PP_BALANCE"
:
"VLLM_USE_PP_BALANCE"
:
lambda
:
(
os
.
getenv
(
'VLLM_USE_PP_BALANCE'
,
'1'
).
lower
()
in
lambda
:
(
os
.
getenv
(
'VLLM_USE_PP_BALANCE'
,
'1'
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
"VLLM_USE_ZERO_MTP"
:
lambda
:
(
os
.
getenv
(
'VLLM_USE_ZERO_MTP'
,
'1'
).
lower
()
in
(
"true"
,
"1"
)),
# vllm will use 1-18... (not only 1 2 4 8 16)
# vllm will use 1-18... (not only 1 2 4 8 16)
"VLLM_USE_CUDA_GRAPH_SIZES"
:
"VLLM_USE_CUDA_GRAPH_SIZES"
:
lambda
:
(
os
.
getenv
(
'VLLM_USE_CUDA_GRAPH_SIZES'
,
'False'
).
lower
()
in
lambda
:
(
os
.
getenv
(
'VLLM_USE_CUDA_GRAPH_SIZES'
,
'False'
).
lower
()
in
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
c50f084a
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment