Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
08c4bafa
Commit
08c4bafa
authored
Mar 26, 2025
by
lizhigong
Browse files
add auto finish and restart thread when finish generate and start new generate
parent
ca4ec0ce
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
14 additions
and
2 deletions
+14
-2
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+12
-2
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+2
-0
No files found.
vllm/engine/llm_engine.py
View file @
08c4bafa
...
...
@@ -413,14 +413,13 @@ class LLMEngine:
self
.
zero_overhead
=
os
.
environ
.
get
(
'VLLM_ZERO_OVERHEAD'
)
==
'1'
if
self
.
zero_overhead
:
# self.step_switch = 0 # 0 step A 1 step B
# self.output_recorder = [None, None]
self
.
async_d2h
=
None
self
.
last_record
=
None
self
.
async_event
=
torch
.
cuda
.
Event
(
enable_timing
=
False
)
self
.
zero_thread
=
threading
.
Thread
(
target
=
self
.
thread_zero_overhead
)
self
.
q_recorder
=
queue
.
Queue
()
self
.
q_recorder
.
put
(
None
)
# None is use for first step ignore
self
.
thread_running
=
True
self
.
sem_m2s
=
threading
.
Semaphore
(
0
)
# main to scheduler thread
self
.
zero_thread
.
start
()
profile
.
StartTracer
()
...
...
@@ -1317,9 +1316,16 @@ class LLMEngine:
def
trans_last_output_tensor
(
self
,
last_output
)
->
torch
.
Tensor
:
return
None
def
finish_thread
(
self
):
if
self
.
zero_overhead
:
self
.
thread_running
=
False
self
.
sem_m2s
.
release
()
def
thread_zero_overhead
(
self
):
while
True
:
self
.
sem_m2s
.
acquire
()
if
not
self
.
thread_running
:
break
virtual_engine
=
0
ctx
=
self
.
scheduler_contexts
[
virtual_engine
]
...
...
@@ -1377,6 +1383,10 @@ class LLMEngine:
self
.
last_record
=
[
outputs
,
seq_group_metadata_list
,
scheduler_outputs
]
def
zero_overhead_step
(
self
)
->
List
[
Union
[
RequestOutput
,
PoolingRequestOutput
]]:
if
not
self
.
thread_running
:
self
.
zero_thread
=
threading
.
Thread
(
target
=
self
.
thread_zero_overhead
)
self
.
thread_running
=
True
self
.
zero_thread
.
start
()
self
.
sem_m2s
.
release
()
recode_output
=
self
.
q_recorder
.
get
()
if
recode_output
is
None
:
# None is for the first step
...
...
vllm/entrypoints/llm.py
View file @
08c4bafa
...
...
@@ -1410,6 +1410,8 @@ class LLM:
if
use_tqdm
:
pbar
.
close
()
self
.
llm_engine
.
finish_thread
()
# Sort the outputs by request ID.
# This is necessary because some requests may be finished earlier than
# its previous requests.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment