Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
59488cc9
Commit
59488cc9
authored
Jun 06, 2025
by
lizhigong
Browse files
fix 修改tbo的线程管理和线程释放方式,减少对其他模块的影响
parent
bd363067
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
43 additions
and
72 deletions
+43
-72
vllm/entrypoints/launcher.py
vllm/entrypoints/launcher.py
+0
-2
vllm/executor/executor_base.py
vllm/executor/executor_base.py
+0
-3
vllm/executor/multiproc_worker_utils.py
vllm/executor/multiproc_worker_utils.py
+0
-4
vllm/two_batch_overlap/two_batch_overlap.py
vllm/two_batch_overlap/two_batch_overlap.py
+43
-61
vllm/worker/worker_base.py
vllm/worker/worker_base.py
+0
-2
No files found.
vllm/entrypoints/launcher.py
View file @
59488cc9
...
...
@@ -78,8 +78,6 @@ async def serve_http(app: FastAPI,
port
,
process
,
" "
.
join
(
process
.
cmdline
()))
logger
.
info
(
"Shutting down FastAPI HTTP server."
)
from
vllm.two_batch_overlap.two_batch_overlap
import
finish_two_batch_overlap
finish_two_batch_overlap
()
return
server
.
shutdown
()
finally
:
watchdog_task
.
cancel
()
...
...
vllm/executor/executor_base.py
View file @
59488cc9
...
...
@@ -16,7 +16,6 @@ from vllm.lora.request import LoRARequest
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sequence
import
ExecuteModelRequest
,
PoolerOutput
from
vllm.two_batch_overlap.two_batch_overlap
import
finish_two_batch_overlap
from
vllm.utils
import
make_async
from
vllm.worker.worker_base
import
WorkerBase
...
...
@@ -144,7 +143,6 @@ class ExecutorBase(ABC):
def
stop_remote_worker_execution_loop
(
self
)
->
None
:
"""Releases parallel workers from model loop."""
finish_two_batch_overlap
()
return
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
...
...
@@ -303,7 +301,6 @@ class DistributedExecutorBase(ExecutorBase):
return
driver_outputs
def
stop_remote_worker_execution_loop
(
self
)
->
None
:
finish_two_batch_overlap
()
if
self
.
parallel_worker_tasks
is
None
:
return
...
...
vllm/executor/multiproc_worker_utils.py
View file @
59488cc9
...
...
@@ -256,10 +256,6 @@ def _run_worker_process(
and
not
tunable
.
record_untuned_is_enabled
()):
tunable
.
write_file
()
from
vllm.two_batch_overlap.two_batch_overlap
import
finish_two_batch_overlap
finish_two_batch_overlap
()
logger
.
info
(
"Worker exiting"
)
...
...
vllm/two_batch_overlap/two_batch_overlap.py
View file @
59488cc9
...
...
@@ -53,24 +53,17 @@ class TwoBatchOverlap():
def
init_tbo_thread
(
self
):
self
.
model_input_left_queue
.
empty
()
self
.
model_input_right_queue
.
empty
()
if
self
.
left_thread
==
None
:
self
.
left_thread
=
threading
.
Thread
(
target
=
self
.
thread_two_batch_overlap
,
args
=
(
self
.
model_input_left_queue
,))
self
.
left_thread
.
start
()
if
self
.
right_thread
==
None
:
self
.
right_thread
=
threading
.
Thread
(
target
=
self
.
thread_two_batch_overlap
,
args
=
(
self
.
model_input_right_queue
,))
self
.
right_thread
.
start
()
logger
.
info
(
'tbo:two batch overlap threads start'
)
self
.
left_thread
=
threading
.
Thread
(
target
=
self
.
thread_two_batch_overlap
,
args
=
(
self
.
model_input_left_queue
,))
self
.
left_thread
.
start
()
self
.
right_thread
=
threading
.
Thread
(
target
=
self
.
thread_two_batch_overlap
,
args
=
(
self
.
model_input_right_queue
,))
self
.
right_thread
.
start
()
logger
.
info
(
'tbo:two batch overlap start'
)
def
finish_thread
(
self
):
if
self
.
left_thread
!=
None
:
self
.
model_input_left_queue
.
put
(
None
)
self
.
left_thread
.
join
()
self
.
left_thread
=
None
if
self
.
right_thread
!=
None
:
self
.
model_input_right_queue
.
put
(
None
)
self
.
right_thread
.
join
()
self
.
right_thread
=
None
logger
.
info
(
'tbo:finish threads'
)
self
.
left_thread
.
join
()
self
.
left_thread
=
None
self
.
right_thread
.
join
()
self
.
right_thread
=
None
@
torch
.
inference_mode
()
def
thread_two_batch_overlap
(
self
,
queue
):
...
...
@@ -84,48 +77,44 @@ class TwoBatchOverlap():
self
.
right_tid
=
tid
init_tbo_forward_context
(
False
,
self
.
right_tid
)
with
torch
.
cuda
.
stream
(
tbo_step_stream
):
while
True
:
model_input
=
queue
.
get
()
if
model_input
==
None
:
break
profile
.
ProfRangePush
(
'start'
)
self
.
tbo_thread_synchronize
(
tid
)
model_kwargs
=
None
intermediate_tensors
=
None
if
is_left_thread
:
model_kwargs
=
self
.
model_kwargs_left
intermediate_tensors
=
self
.
intermediate_tensors_left
else
:
model_kwargs
=
self
.
model_kwargs_right
intermediate_tensors
=
self
.
intermediate_tensors_right
with
set_forward_context
(
model_input
.
attn_metadata
,
self
.
vllm_config
,
self
.
virtual_engine
):
hidden_or_intermediate_states
=
self
.
model_executable
(
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalKwargs
.
as_kwargs
(
self
.
multi_modal_kwargs
,
device
=
self
.
self_device
),
**
self
.
seqlen_agnostic_kwargs
,
**
model_kwargs
,
)
if
is_left_thread
:
self
.
sem_right
.
release
()
self
.
states_left_queue
.
put
(
hidden_or_intermediate_states
)
else
:
self
.
all_reduce_queue
.
put
(
None
)
self
.
states_right_queue
.
put
(
hidden_or_intermediate_states
)
profile
.
ProfRangePop
()
model_input
=
queue
.
get
()
profile
.
ProfRangePush
(
'start'
)
self
.
tbo_thread_synchronize
(
tid
)
model_kwargs
=
None
intermediate_tensors
=
None
if
is_left_thread
:
model_kwargs
=
self
.
model_kwargs_left
intermediate_tensors
=
self
.
intermediate_tensors_left
else
:
model_kwargs
=
self
.
model_kwargs_right
intermediate_tensors
=
self
.
intermediate_tensors_right
with
set_forward_context
(
model_input
.
attn_metadata
,
self
.
vllm_config
,
self
.
virtual_engine
):
hidden_or_intermediate_states
=
self
.
model_executable
(
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalKwargs
.
as_kwargs
(
self
.
multi_modal_kwargs
,
device
=
self
.
self_device
),
**
self
.
seqlen_agnostic_kwargs
,
**
model_kwargs
,
)
if
is_left_thread
:
self
.
sem_right
.
release
()
self
.
states_left_queue
.
put
(
hidden_or_intermediate_states
)
else
:
self
.
all_reduce_queue
.
put
(
None
)
self
.
states_right_queue
.
put
(
hidden_or_intermediate_states
)
profile
.
ProfRangePop
()
def
tbo_thread_synchronize
(
self
,
tid
):
if
tid
==
self
.
left_tid
:
if
not
self
.
left_first
:
self
.
sem_right
.
release
()
self
.
left_first
=
False
profile
.
ProfRangePop
()
self
.
sem_left
.
acquire
()
profile
.
ProfRangePush
(
'left'
)
self
.
left_first
=
False
return
self
.
event_left_c2t
,
self
.
event_left_t2c
else
:
self
.
sem_left
.
release
()
...
...
@@ -147,8 +136,6 @@ class TwoBatchOverlap():
seqlen_agnostic_kwargs
,
model_kwargs_left
,
model_kwargs_right
):
if
self
.
left_thread
==
None
:
self
.
init_tbo_thread
()
self
.
vllm_config
=
vllm_config
self
.
virtual_engine
=
virtual_engine
self
.
model_executable
=
model_executable
...
...
@@ -186,16 +173,10 @@ class TwoBatchOverlap():
tbo_obj
=
None
def
init_two_batch_overlap
():
if
envs
.
VLLM_ENABLE_TBO
:
global
tbo_obj
if
tbo_obj
==
None
:
tbo_obj
=
TwoBatchOverlap
()
def
finish_two_batch_overlap
():
global
tbo_obj
if
tbo_obj
!=
None
:
tbo_obj
.
finish_thread
()
tbo_obj
=
None
if
tbo_obj
==
None
:
tbo_obj
=
TwoBatchOverlap
()
tbo_obj
.
init_tbo_thread
()
def
tbo_all_reduce
(
obj
):
if
envs
.
VLLM_ENABLE_TBO
and
tbo_obj
!=
None
and
tbo_obj
.
tbo_running
:
...
...
@@ -309,6 +290,7 @@ def tbo_model_executable(
hidden_or_intermediate_states
=
merge_model_output
(
states_left
,
states_right
)
tbo_obj
.
tbo_running
=
False
tbo_obj
.
step_event
.
record
()
tbo_obj
.
finish_thread
()
current_stream
.
wait_event
(
tbo_obj
.
step_event
)
profile
.
ProfRangePop
()
return
hidden_or_intermediate_states
vllm/worker/worker_base.py
View file @
59488cc9
...
...
@@ -18,7 +18,6 @@ from vllm.logger import init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
,
IntermediateTensors
from
vllm.two_batch_overlap.two_batch_overlap
import
finish_two_batch_overlap
from
vllm.utils
import
(
enable_trace_function_call_for_thread
,
resolve_obj_by_qualname
,
run_method
,
update_environment_variables
,
...
...
@@ -113,7 +112,6 @@ class WorkerBase:
while
True
:
output
=
self
.
execute_model
(
execute_model_req
=
None
)
if
output
is
None
:
finish_two_batch_overlap
()
return
None
def
determine_num_available_blocks
(
self
)
->
Tuple
[
int
,
int
]:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment