Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9076ef2b
"vscode:/vscode.git/clone" did not exist on "85eb6318391fc25d708af6109480498b45f56c77"
Commit
9076ef2b
authored
Apr 15, 2025
by
lizhigong
Browse files
add VLLM_ZERO_DISABLE_AUTO_THREAD VLLM_ZERO_NO_THREAD change zero scheduling logic
parent
4ff58b66
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
120 additions
and
9 deletions
+120
-9
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+2
-2
vllm/zero_overhead/v0/llm_engine.py
vllm/zero_overhead/v0/llm_engine.py
+109
-6
vllm/zero_overhead/v0/utils.py
vllm/zero_overhead/v0/utils.py
+9
-1
No files found.
vllm/entrypoints/llm.py
View file @
9076ef2b
...
...
@@ -44,7 +44,7 @@ from vllm.usage.usage_lib import UsageContext
from
vllm.utils
import
(
Counter
,
Device
,
deprecate_args
,
deprecate_kwargs
,
is_list_of
)
from
vllm.zero_overhead.v0.llm_engine
import
ZeroOverheadEngine
from
vllm.zero_overhead.v0.utils
import
is_zero_overhead
from
vllm.zero_overhead.v0.utils
import
is_zero_auto_thread
,
is_zero_overhead
logger
=
init_logger
(
__name__
)
...
...
@@ -1450,7 +1450,7 @@ class LLM:
if
use_tqdm
:
pbar
.
close
()
if
is_zero_
overh
ead
():
if
is_zero_
auto_thr
ead
():
self
.
llm_engine
.
finish_thread
()
# Sort the outputs by request ID.
# This is necessary because some requests may be finished earlier than
...
...
vllm/zero_overhead/v0/llm_engine.py
View file @
9076ef2b
...
...
@@ -40,6 +40,7 @@ from vllm.zero_overhead.v0.tokenizer import ZeroOverheadDetokenizer
from
vllm.usage.usage_lib
import
(
UsageContext
,
is_usage_stats_enabled
,
usage_message
)
from
vllm.profiler.prof
import
profile
from
vllm.zero_overhead.v0.utils
import
is_zero_no_thread
logger
=
init_logger
(
__name__
)
...
...
@@ -264,11 +265,13 @@ class ZeroOverheadEngine(LLMEngine):
self
.
last_record
=
None
assert
os
.
environ
.
get
(
'HIP_ALLOC_INITIALIZE'
)
==
'0'
self
.
async_event
=
torch
.
cuda
.
Event
(
enable_timing
=
False
)
self
.
zero_
thread
=
threading
.
Thread
(
target
=
self
.
thread_zero_overhead
)
self
.
thread
_running
=
False
self
.
q_recorder
=
queue
.
Queue
()
self
.
thread_running
=
True
self
.
sem_m2s
=
threading
.
Semaphore
(
0
)
# main to scheduler thread
self
.
zero_thread
.
start
()
if
not
is_zero_no_thread
():
self
.
zero_thread
=
threading
.
Thread
(
target
=
self
.
thread_zero_overhead
)
self
.
thread_running
=
True
self
.
sem_m2s
=
threading
.
Semaphore
(
0
)
# main to scheduler thread
self
.
zero_thread
.
start
()
profile
.
StartTracer
()
def
__del__
(
self
):
...
...
@@ -426,11 +429,111 @@ class ZeroOverheadEngine(LLMEngine):
seq
.
fix_last_token_id
(
sample
.
output_token
)
break
def
no_thread_step
(
self
):
virtual_engine
=
0
# Clear outputs for each new scheduler iteration
# Schedule iteration
(
seq_group_metadata_list
,
scheduler_outputs
,
allow_async_output_proc
)
=
self
.
scheduler
[
virtual_engine
].
schedule
()
if
self
.
last_record
is
not
None
:
last_sampler
=
self
.
last_record
[
1
]
self
.
async_d2h
=
last_sampler
.
sampled_token_ids_tensor
.
to
(
'cpu'
,
non_blocking
=
True
)
self
.
async_event
.
record
()
self
.
q_recorder
.
put
(
self
.
last_record
)
else
:
self
.
q_recorder
.
put
(
None
)
if
len
(
seq_group_metadata_list
)
==
0
:
self
.
last_record
=
None
else
:
finished_requests_ids
=
self
.
scheduler
[
virtual_engine
].
get_and_reset_finished_requests_ids
()
assert
seq_group_metadata_list
is
not
None
assert
scheduler_outputs
is
not
None
last_sampled_token_ids
=
\
self
.
_get_last_sampled_token_ids
(
virtual_engine
)
execute_model_req
=
ExecuteModelRequest
(
seq_group_metadata_list
=
seq_group_metadata_list
,
blocks_to_swap_in
=
scheduler_outputs
.
blocks_to_swap_in
,
blocks_to_swap_out
=
scheduler_outputs
.
blocks_to_swap_out
,
blocks_to_copy
=
scheduler_outputs
.
blocks_to_copy
,
num_lookahead_slots
=
scheduler_outputs
.
num_lookahead_slots
,
running_queue_size
=
scheduler_outputs
.
running_queue_size
,
finished_requests_ids
=
finished_requests_ids
,
# We use ExecuteModelRequest to pass the last sampled_token_ids
# to each of the non-last PP stages for in-place prepare_input.
last_sampled_token_ids
=
last_sampled_token_ids
)
outputs
=
self
.
model_executor
.
execute_model
(
execute_model_req
=
execute_model_req
)
if
len
(
outputs
)
==
1
:
self
.
_advance_to_next_step
(
outputs
[
0
],
seq_group_metadata_list
,
scheduler_outputs
.
scheduled_seq_groups
)
scheduler_outputs
.
scheduled_seq_groups
=
[
item
for
item
in
scheduler_outputs
.
scheduled_seq_groups
]
#deep copy
last_sampler
=
get_last_sampler
()
self
.
last_record
=
[
outputs
,
last_sampler
,
seq_group_metadata_list
,
scheduler_outputs
]
recode_output
=
self
.
q_recorder
.
get
()
if
recode_output
is
None
:
# None is for the first step
return
None
virtual_engine
=
0
ctx
=
self
.
scheduler_contexts
[
virtual_engine
]
ctx
.
request_outputs
.
clear
()
outputs
,
last_sampler
,
seq_group_metadata_list
,
scheduler_outputs
=
recode_output
ctx
.
seq_group_metadata_list
=
seq_group_metadata_list
ctx
.
scheduler_outputs
=
scheduler_outputs
self
.
async_event
.
synchronize
()
self
.
_fix_last_step
(
outputs
,
last_sampler
,
seq_group_metadata_list
,
scheduler_outputs
.
scheduled_seq_groups
)
# is_first_step_output is True only when the num_steps of all
# the sequences are 1. When the num_steps > 1,
# multi_step_model_runner does the first-step output append.
is_first_step_output
:
bool
=
False
if
not
seq_group_metadata_list
\
else
seq_group_metadata_list
[
0
].
state
.
num_steps
==
1
# Add results to the output_queue
ctx
.
append_output
(
outputs
=
outputs
,
seq_group_metadata_list
=
seq_group_metadata_list
,
scheduler_outputs
=
scheduler_outputs
,
is_async
=
True
,
is_last_step
=
True
,
is_first_step_output
=
is_first_step_output
)
# Check if need to run the usual non-async path
#if not allow_async_output_proc:
self
.
_process_model_outputs
(
ctx
=
ctx
)
#profile.ProfRangeAutoPush('has_unfinish')
if
not
self
.
has_unfinished_requests
():
# Drain async postprocessor (if exists)
if
len
(
ctx
.
output_queue
)
>
0
:
self
.
_process_model_outputs
(
ctx
=
ctx
)
assert
len
(
ctx
.
output_queue
)
==
0
# Stop the execute model loop in parallel workers until there are
# more requests to process. This avoids waiting indefinitely in
# torch.distributed ops which may otherwise timeout, and unblocks
# the RPC thread in the workers so that they can process any other
# queued control plane messages, such as add/remove lora adapters.
logger
.
debug
(
"Stopping remote worker execution loop."
)
self
.
model_executor
.
stop_remote_worker_execution_loop
()
return
ctx
.
request_outputs
def
step
(
self
)
->
List
[
Union
[
RequestOutput
,
PoolingRequestOutput
]]:
out
=
self
.
zero_overhead_step
()
if
out
is
None
:
#the first step need launch twice
if
is_zero_no_thread
():
out
=
self
.
no_thread_step
()
if
out
is
None
:
#the first step need launch twice
out
=
self
.
no_thread_step
()
else
:
out
=
self
.
zero_overhead_step
()
if
out
is
None
:
#the first step need launch twice
out
=
self
.
zero_overhead_step
()
return
out
def
_add_processed_request
(
...
...
vllm/zero_overhead/v0/utils.py
View file @
9076ef2b
...
...
@@ -3,6 +3,14 @@
import
os
zero_overhead
=
os
.
environ
.
get
(
'VLLM_ZERO_OVERHEAD'
)
==
'1'
disable_auto_finish_thread
=
os
.
environ
.
get
(
'VLLM_ZERO_DISABLE_AUTO_THREAD'
)
==
'1'
zero_no_thread
=
os
.
environ
.
get
(
'VLLM_ZERO_NO_THREAD'
)
==
'1'
def
is_zero_overhead
():
return
zero_overhead
\ No newline at end of file
return
zero_overhead
def
is_zero_auto_thread
():
return
(
not
disable_auto_finish_thread
)
and
zero_overhead
and
(
not
zero_no_thread
)
def
is_zero_no_thread
():
return
zero_no_thread
and
zero_overhead
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment