Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
70b81c4f
Unverified
Commit
70b81c4f
authored
Mar 19, 2026
by
youkaichao
Committed by
GitHub
Mar 18, 2026
Browse files
[bugfix][async scheduling] fix extra cuda context in device 0 with EP/DP (#37449)
Signed-off-by:
youkaichao
<
youkaichao@gmail.com
>
parent
7476d148
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
23 additions
and
11 deletions
+23
-11
vllm/v1/executor/multiproc_executor.py
vllm/v1/executor/multiproc_executor.py
+23
-11
No files found.
vllm/v1/executor/multiproc_executor.py
View file @
70b81c4f
...
@@ -597,17 +597,6 @@ class WorkerProc:
...
@@ -597,17 +597,6 @@ class WorkerProc:
wrapper
.
init_worker
(
all_kwargs
)
wrapper
.
init_worker
(
all_kwargs
)
self
.
worker
=
wrapper
self
.
worker
=
wrapper
scheduler_config
=
vllm_config
.
scheduler_config
self
.
use_async_scheduling
=
scheduler_config
.
async_scheduling
if
self
.
use_async_scheduling
:
self
.
async_output_queue
:
queue
.
Queue
=
queue
.
Queue
()
self
.
async_output_copy_thread
=
Thread
(
target
=
self
.
async_output_busy_loop
,
daemon
=
True
,
name
=
"WorkerAsyncOutputCopy"
,
)
self
.
async_output_copy_thread
.
start
()
self
.
setup_proc_title_and_log_prefix
(
self
.
setup_proc_title_and_log_prefix
(
enable_ep
=
vllm_config
.
parallel_config
.
enable_expert_parallel
enable_ep
=
vllm_config
.
parallel_config
.
enable_expert_parallel
)
)
...
@@ -622,6 +611,17 @@ class WorkerProc:
...
@@ -622,6 +611,17 @@ class WorkerProc:
)
)
self
.
worker
.
load_model
()
self
.
worker
.
load_model
()
scheduler_config
=
vllm_config
.
scheduler_config
self
.
use_async_scheduling
=
scheduler_config
.
async_scheduling
if
self
.
use_async_scheduling
:
self
.
async_output_queue
:
queue
.
Queue
=
queue
.
Queue
()
self
.
async_output_copy_thread
=
Thread
(
target
=
self
.
async_output_busy_loop
,
daemon
=
True
,
name
=
"WorkerAsyncOutputCopy"
,
)
self
.
async_output_copy_thread
.
start
()
# Set block size based on the attention backends
# Set block size based on the attention backends
current_platform
.
update_block_size_for_backend
(
vllm_config
)
current_platform
.
update_block_size_for_backend
(
vllm_config
)
...
@@ -911,6 +911,18 @@ class WorkerProc:
...
@@ -911,6 +911,18 @@ class WorkerProc:
def
async_output_busy_loop
(
self
):
def
async_output_busy_loop
(
self
):
"""Entrypoint for the thread which handles outputs asynchronously."""
"""Entrypoint for the thread which handles outputs asynchronously."""
# Set the device for this thread to the worker's device.
# A thread does not inherit the context of the main thread, so
# any CUDA runtime call it makes would otherwise implicitly
# create a new CUDA context on device 0, consuming extra memory.
# Setting the device to the worker's device here ensures this thread
# uses the same context as the main thread.
from
vllm.platforms
import
current_platform
if
hasattr
(
self
.
worker
,
"device"
):
current_platform
.
set_device
(
self
.
worker
.
device
)
while
True
:
while
True
:
output
=
self
.
async_output_queue
.
get
()
output
=
self
.
async_output_queue
.
get
()
self
.
enqueue_output
(
output
)
self
.
enqueue_output
(
output
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment