Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f4b01cd4
Commit
f4b01cd4
authored
Oct 31, 2025
by
jujl1
Browse files
feat: pipeline_parallel新增pp域请求数均衡,VLLM_USE_PP_BALANCE控制,默认开启
parent
18a43696
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
37 additions
and
9 deletions
+37
-9
vllm/envs.py
vllm/envs.py
+11
-7
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+26
-2
No files found.
vllm/envs.py
View file @
f4b01cd4
...
@@ -177,6 +177,7 @@ if TYPE_CHECKING:
...
@@ -177,6 +177,7 @@ if TYPE_CHECKING:
VLLM_P2P_BUF_TOKENS
:
int
=
30000
VLLM_P2P_BUF_TOKENS
:
int
=
30000
VLLM_SCHED_ENABLE_MINIMAL_INJECTION
:
bool
=
False
VLLM_SCHED_ENABLE_MINIMAL_INJECTION
:
bool
=
False
VLLM_USE_PD_SPLIT
:
bool
=
False
VLLM_USE_PD_SPLIT
:
bool
=
False
VLLM_USE_PP_BALANCE
:
bool
=
False
def
get_default_cache_root
():
def
get_default_cache_root
():
return
os
.
getenv
(
return
os
.
getenv
(
...
@@ -1094,7 +1095,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1094,7 +1095,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_FLASH_ATTN_PA"
:
"VLLM_USE_FLASH_ATTN_PA"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_FLASH_ATTN_PA"
,
"True"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_FLASH_ATTN_PA"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vLLM will use apex for rmsnorm
# vLLM will use apex for rmsnorm
"VLLM_USE_APEX_RN"
:
"VLLM_USE_APEX_RN"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_APEX_RN"
,
"False"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_APEX_RN"
,
"False"
).
lower
()
in
...
@@ -1130,7 +1131,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1130,7 +1131,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vLLM will use lightop moe_align_block_size
# vLLM will use lightop moe_align_block_size
"VLLM_USE_LIGHTOP_MOE_ALIGN"
:
"VLLM_USE_LIGHTOP_MOE_ALIGN"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_MOE_ALIGN"
,
"True"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_MOE_ALIGN"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vLLM will use opt merge_attn_states, not triton
# vLLM will use opt merge_attn_states, not triton
"VLLM_USE_MERGE_ATTN_STATES_OPT"
:
"VLLM_USE_MERGE_ATTN_STATES_OPT"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_MERGE_ATTN_STATES_OPT"
,
"True"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_MERGE_ATTN_STATES_OPT"
,
"True"
).
lower
()
in
...
@@ -1153,20 +1154,23 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1153,20 +1154,23 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vllm pd separation will be used async
# vllm pd separation will be used async
"VLLM_P2P_ASYNC"
:
"VLLM_P2P_ASYNC"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_P2P_ASYNC"
,
"0"
))),
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_P2P_ASYNC"
,
"0"
))),
# pd separation p2p async buf tokens
# pd separation p2p async buf tokens
"VLLM_P2P_BUF_TOKENS"
:
"VLLM_P2P_BUF_TOKENS"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_P2P_BUF_TOKENS"
,
"30000"
)),
lambda
:
int
(
os
.
getenv
(
"VLLM_P2P_BUF_TOKENS"
,
"30000"
)),
# vllm will enable minimal injection for pipeline parallel scheduling
# vllm will enable minimal injection for pipeline parallel scheduling
"VLLM_SCHED_ENABLE_MINIMAL_INJECTION"
:
"VLLM_SCHED_ENABLE_MINIMAL_INJECTION"
:
lambda
:
(
os
.
getenv
(
"VLLM_SCHED_ENABLE_MINIMAL_INJECTION"
,
"0"
).
lower
()
in
lambda
:
(
os
.
getenv
(
"VLLM_SCHED_ENABLE_MINIMAL_INJECTION"
,
"0"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vLLM will split prefill and decode, not mix up
# vLLM will split prefill and decode, not mix up
"VLLM_USE_PD_SPLIT"
:
"VLLM_USE_PD_SPLIT"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PD_SPLIT"
,
"True"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PD_SPLIT"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
"VLLM_USE_PP_BALANCE"
:
lambda
:
(
os
.
getenv
(
'VLLM_USE_PP_BALANCE'
,
'1'
).
lower
()
in
(
"true"
,
"1"
)),
}
}
# --8<-- [end:env-vars-definition]
# --8<-- [end:env-vars-definition]
...
...
vllm/v1/core/sched/scheduler.py
View file @
f4b01cd4
...
@@ -213,9 +213,17 @@ class Scheduler(SchedulerInterface):
...
@@ -213,9 +213,17 @@ class Scheduler(SchedulerInterface):
# First, schedule the RUNNING requests.
# First, schedule the RUNNING requests.
req_index
=
0
req_index
=
0
if
envs
.
VLLM_USE_PP_BALANCE
and
self
.
use_pp
:
pipeline_size
=
self
.
parallel_config
.
pipeline_parallel_size
max_batch_running
=
(
len
(
self
.
waiting
)
+
len
(
self
.
running
)
+
pipeline_size
-
1
)
//
pipeline_size
while
req_index
<
len
(
self
.
running
)
and
token_budget
>
0
:
while
req_index
<
len
(
self
.
running
)
and
token_budget
>
0
:
request
=
self
.
running
[
req_index
]
request
=
self
.
running
[
req_index
]
if
(
envs
.
VLLM_USE_PP_BALANCE
and
self
.
use_pp
and
len
(
scheduled_new_reqs
)
+
len
(
scheduled_resumed_reqs
)
+
len
(
scheduled_running_reqs
)
>=
max_batch_running
):
break
num_new_tokens
=
(
request
.
num_tokens_with_spec
-
num_new_tokens
=
(
request
.
num_tokens_with_spec
-
request
.
num_computed_tokens
)
request
.
num_computed_tokens
)
if
(
0
<
self
.
scheduler_config
.
long_prefill_token_threshold
<
if
(
0
<
self
.
scheduler_config
.
long_prefill_token_threshold
<
...
@@ -358,8 +366,13 @@ class Scheduler(SchedulerInterface):
...
@@ -358,8 +366,13 @@ class Scheduler(SchedulerInterface):
if
len
(
self
.
running
)
==
self
.
max_num_running_reqs
:
if
len
(
self
.
running
)
==
self
.
max_num_running_reqs
:
break
break
if
(
envs
.
VLLM_USE_PP_BALANCE
and
self
.
use_pp
and
len
(
scheduled_new_reqs
)
+
len
(
scheduled_resumed_reqs
)
+
len
(
scheduled_running_reqs
)
>=
max_batch_running
):
break
request
=
self
.
waiting
.
peek_request
()
request
=
self
.
waiting
.
peek_request
()
if
request
.
is_finished
():
if
request
.
is_finished
():
self
.
waiting
.
pop_request
()
self
.
waiting
.
pop_request
()
continue
continue
...
@@ -648,11 +661,18 @@ class Scheduler(SchedulerInterface):
...
@@ -648,11 +661,18 @@ class Scheduler(SchedulerInterface):
skipped_waiting_requests
=
create_request_queue
(
self
.
policy
)
skipped_waiting_requests
=
create_request_queue
(
self
.
policy
)
req_index
=
len
(
self
.
running
)
req_index
=
len
(
self
.
running
)
if
envs
.
VLLM_USE_PP_BALANCE
and
self
.
use_pp
:
pipeline_size
=
self
.
parallel_config
.
pipeline_parallel_size
max_batch_running
=
(
len
(
self
.
waiting
)
+
len
(
self
.
running
)
+
pipeline_size
-
1
)
//
pipeline_size
# First, schedule the WAITING requests.
# First, schedule the WAITING requests.
while
self
.
waiting
and
token_budget
>
0
:
while
self
.
waiting
and
token_budget
>
0
:
if
len
(
self
.
running
)
==
self
.
max_num_running_reqs
:
if
len
(
self
.
running
)
==
self
.
max_num_running_reqs
:
break
break
# TODO: 考虑到 decode 过程中来新请求时,可以一次性处理所有请求的 prefill,也许在 schedule the WAITING requests 中取消 pp 平衡效果更好
if
(
envs
.
VLLM_USE_PP_BALANCE
and
self
.
use_pp
and
len
(
scheduled_new_reqs
)
+
len
(
scheduled_resumed_reqs
)
+
len
(
scheduled_running_reqs
)
>=
max_batch_running
):
break
request
=
self
.
waiting
.
peek_request
()
request
=
self
.
waiting
.
peek_request
()
# KVTransfer: skip request if still waiting for remote kvs.
# KVTransfer: skip request if still waiting for remote kvs.
...
@@ -833,6 +853,10 @@ class Scheduler(SchedulerInterface):
...
@@ -833,6 +853,10 @@ class Scheduler(SchedulerInterface):
if
not
scheduled_new_reqs
and
not
scheduled_resumed_reqs
:
if
not
scheduled_new_reqs
and
not
scheduled_resumed_reqs
:
req_index
=
0
req_index
=
0
while
req_index
<
len
(
self
.
running
)
and
token_budget
>
0
:
while
req_index
<
len
(
self
.
running
)
and
token_budget
>
0
:
if
(
envs
.
VLLM_USE_PP_BALANCE
and
self
.
use_pp
and
len
(
scheduled_new_reqs
)
+
len
(
scheduled_resumed_reqs
)
+
len
(
scheduled_running_reqs
)
>=
max_batch_running
):
break
request
=
self
.
running
[
req_index
]
request
=
self
.
running
[
req_index
]
num_new_tokens
=
(
request
.
num_tokens_with_spec
-
num_new_tokens
=
(
request
.
num_tokens_with_spec
-
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment