Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
76ec56bd
Commit
76ec56bd
authored
Feb 06, 2026
by
jujl1
Browse files
feat: pp balance
parent
b8f555af
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
22 additions
and
7 deletions
+22
-7
vllm/envs.py
vllm/envs.py
+6
-2
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+16
-5
No files found.
vllm/envs.py
View file @
76ec56bd
...
@@ -291,6 +291,7 @@ if TYPE_CHECKING:
...
@@ -291,6 +291,7 @@ if TYPE_CHECKING:
VLLM_USE_FUSED_RMS_ROPE
:
bool
=
False
VLLM_USE_FUSED_RMS_ROPE
:
bool
=
False
VLLM_USE_FUSED_FILL_RMS_CAT
:
bool
=
False
VLLM_USE_FUSED_FILL_RMS_CAT
:
bool
=
False
VLLM_W8A8_BACKEND
:
int
=
3
VLLM_W8A8_BACKEND
:
int
=
3
VLLM_USE_PP_BALANCE
=
True
VLLM_REJECT_SAMPLE_OPT
:
bool
=
False
VLLM_REJECT_SAMPLE_OPT
:
bool
=
False
...
@@ -1831,6 +1832,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1831,6 +1832,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_FUSED_FILL_RMS_CAT"
:
"VLLM_USE_FUSED_FILL_RMS_CAT"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_FUSED_FILL_RMS_CAT"
,
"False"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_FUSED_FILL_RMS_CAT"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
"VLLM_USE_PP_BALANCE"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PP_BALANCE"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
# W8A8 GEMM backend selection for vLLM quantized models.
# W8A8 GEMM backend selection for vLLM quantized models.
# lightop/triton: 1
# lightop/triton: 1
# cutlass: 2 (will remove in the future)
# cutlass: 2 (will remove in the future)
...
...
vllm/v1/core/sched/scheduler.py
View file @
76ec56bd
...
@@ -343,7 +343,10 @@ class Scheduler(SchedulerInterface):
...
@@ -343,7 +343,10 @@ class Scheduler(SchedulerInterface):
# For logging.
# For logging.
scheduled_timestamp
=
time
.
monotonic
()
scheduled_timestamp
=
time
.
monotonic
()
if
self
.
use_pp
and
envs
.
VLLM_USE_PP_BALANCE
:
pipeline_size
=
self
.
parallel_config
.
pipeline_parallel_size
max_batch_running
=
(
len
(
self
.
waiting
)
+
len
(
self
.
running
)
+
pipeline_size
-
1
)
//
pipeline_size
# First, schedule the RUNNING requests.
# First, schedule the RUNNING requests.
req_index
=
0
req_index
=
0
while
req_index
<
len
(
self
.
running
)
and
token_budget
>
0
:
while
req_index
<
len
(
self
.
running
)
and
token_budget
>
0
:
...
@@ -352,7 +355,12 @@ class Scheduler(SchedulerInterface):
...
@@ -352,7 +355,12 @@ class Scheduler(SchedulerInterface):
# do not schedule another step for the same request while it still has
# do not schedule another step for the same request while it still has
# output placeholders for PP.
# output placeholders for PP.
# TODO: support PP + async scheduling without this limit
# TODO: support PP + async scheduling without this limit
if
self
.
use_pp
and
request
.
num_output_placeholders
>
0
:
if
self
.
use_pp
:
if
(
envs
.
VLLM_USE_PP_BALANCE
and
len
(
scheduled_new_reqs
)
+
len
(
scheduled_resumed_reqs
)
+
len
(
scheduled_running_reqs
)
>=
max_batch_running
):
break
if
request
.
num_output_placeholders
>
0
:
req_index
+=
1
req_index
+=
1
continue
continue
...
@@ -543,7 +551,10 @@ class Scheduler(SchedulerInterface):
...
@@ -543,7 +551,10 @@ class Scheduler(SchedulerInterface):
while
self
.
waiting
and
token_budget
>
0
:
while
self
.
waiting
and
token_budget
>
0
:
if
len
(
self
.
running
)
==
self
.
max_num_running_reqs
:
if
len
(
self
.
running
)
==
self
.
max_num_running_reqs
:
break
break
if
(
self
.
use_pp
and
envs
.
VLLM_USE_PP_BALANCE
and
len
(
scheduled_new_reqs
)
+
len
(
scheduled_resumed_reqs
)
+
len
(
scheduled_running_reqs
)
>=
max_batch_running
):
break
request
=
self
.
waiting
.
peek_request
()
request
=
self
.
waiting
.
peek_request
()
# KVTransfer: skip request if still waiting for remote kvs.
# KVTransfer: skip request if still waiting for remote kvs.
if
request
.
status
==
RequestStatus
.
WAITING_FOR_REMOTE_KVS
:
if
request
.
status
==
RequestStatus
.
WAITING_FOR_REMOTE_KVS
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment