Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d41faaf9
Unverified
Commit
d41faaf9
authored
Apr 21, 2025
by
Han Zhang
Committed by
GitHub
Apr 21, 2025
Browse files
Restore buffers when wake up from level 2 sleep (#16564) (#16889)
Signed-off-by:
Han
<
zh950713@gmail.com
>
parent
b34f3343
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
40 additions
and
0 deletions
+40
-0
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+20
-0
vllm/worker/worker.py
vllm/worker/worker.py
+20
-0
No files found.
vllm/v1/worker/gpu_worker.py
View file @
d41faaf9
...
@@ -54,6 +54,9 @@ class Worker(WorkerBase):
...
@@ -54,6 +54,9 @@ class Worker(WorkerBase):
from
vllm.utils
import
init_cached_hf_modules
from
vllm.utils
import
init_cached_hf_modules
init_cached_hf_modules
()
init_cached_hf_modules
()
# Buffers saved before sleep
self
.
_sleep_saved_buffers
:
dict
[
str
,
torch
.
Tensor
]
=
{}
# Torch profiler. Enabled and configured through env vars:
# Torch profiler. Enabled and configured through env vars:
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if
envs
.
VLLM_TORCH_PROFILER_DIR
:
if
envs
.
VLLM_TORCH_PROFILER_DIR
:
...
@@ -73,6 +76,15 @@ class Worker(WorkerBase):
...
@@ -73,6 +76,15 @@ class Worker(WorkerBase):
def
sleep
(
self
,
level
:
int
=
1
)
->
None
:
def
sleep
(
self
,
level
:
int
=
1
)
->
None
:
free_bytes_before_sleep
=
torch
.
cuda
.
mem_get_info
()[
0
]
free_bytes_before_sleep
=
torch
.
cuda
.
mem_get_info
()[
0
]
# Save the buffers before level 2 sleep
if
level
==
2
:
model
=
self
.
model_runner
.
model
self
.
_sleep_saved_buffers
=
{
name
:
buffer
.
cpu
().
clone
()
for
name
,
buffer
in
model
.
named_buffers
()
}
allocator
=
CuMemAllocator
.
get_instance
()
allocator
=
CuMemAllocator
.
get_instance
()
allocator
.
sleep
(
offload_tags
=
(
"weights"
,
)
if
level
==
1
else
tuple
())
allocator
.
sleep
(
offload_tags
=
(
"weights"
,
)
if
level
==
1
else
tuple
())
free_bytes_after_sleep
,
total
=
torch
.
cuda
.
mem_get_info
()
free_bytes_after_sleep
,
total
=
torch
.
cuda
.
mem_get_info
()
...
@@ -88,6 +100,14 @@ class Worker(WorkerBase):
...
@@ -88,6 +100,14 @@ class Worker(WorkerBase):
allocator
=
CuMemAllocator
.
get_instance
()
allocator
=
CuMemAllocator
.
get_instance
()
allocator
.
wake_up
(
tags
)
allocator
.
wake_up
(
tags
)
# Restore the buffers after level 2 sleep
if
len
(
self
.
_sleep_saved_buffers
):
model
=
self
.
model_runner
.
model
for
name
,
buffer
in
model
.
named_buffers
():
if
name
in
self
.
_sleep_saved_buffers
:
buffer
.
data
.
copy_
(
self
.
_sleep_saved_buffers
[
name
].
data
)
self
.
_sleep_saved_buffers
=
{}
def
init_device
(
self
):
def
init_device
(
self
):
if
self
.
device_config
.
device
.
type
==
"cuda"
:
if
self
.
device_config
.
device
.
type
==
"cuda"
:
# torch.distributed.all_reduce does not free the input tensor until
# torch.distributed.all_reduce does not free the input tensor until
...
...
vllm/worker/worker.py
View file @
d41faaf9
...
@@ -95,6 +95,9 @@ class Worker(LocalOrDistributedWorkerBase):
...
@@ -95,6 +95,9 @@ class Worker(LocalOrDistributedWorkerBase):
self
.
gpu_cache
:
Optional
[
List
[
List
[
torch
.
Tensor
]]]
=
None
self
.
gpu_cache
:
Optional
[
List
[
List
[
torch
.
Tensor
]]]
=
None
self
.
_seq_group_metadata_cache
:
Dict
[
str
,
SequenceGroupMetadata
]
=
{}
self
.
_seq_group_metadata_cache
:
Dict
[
str
,
SequenceGroupMetadata
]
=
{}
# Buffers saved before sleep
self
.
_sleep_saved_buffers
:
Dict
[
str
,
torch
.
Tensor
]
=
{}
# Torch profiler. Enabled and configured through env vars:
# Torch profiler. Enabled and configured through env vars:
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if
envs
.
VLLM_TORCH_PROFILER_DIR
:
if
envs
.
VLLM_TORCH_PROFILER_DIR
:
...
@@ -124,6 +127,15 @@ class Worker(LocalOrDistributedWorkerBase):
...
@@ -124,6 +127,15 @@ class Worker(LocalOrDistributedWorkerBase):
def
sleep
(
self
,
level
:
int
=
1
)
->
None
:
def
sleep
(
self
,
level
:
int
=
1
)
->
None
:
free_bytes_before_sleep
=
torch
.
cuda
.
mem_get_info
()[
0
]
free_bytes_before_sleep
=
torch
.
cuda
.
mem_get_info
()[
0
]
# Save the buffers before level 2 sleep
if
level
==
2
:
model
=
self
.
model_runner
.
model
self
.
_sleep_saved_buffers
=
{
name
:
buffer
.
cpu
().
clone
()
for
name
,
buffer
in
model
.
named_buffers
()
}
allocator
=
CuMemAllocator
.
get_instance
()
allocator
=
CuMemAllocator
.
get_instance
()
allocator
.
sleep
(
offload_tags
=
(
"weights"
,
)
if
level
==
1
else
tuple
())
allocator
.
sleep
(
offload_tags
=
(
"weights"
,
)
if
level
==
1
else
tuple
())
free_bytes_after_sleep
,
total
=
torch
.
cuda
.
mem_get_info
()
free_bytes_after_sleep
,
total
=
torch
.
cuda
.
mem_get_info
()
...
@@ -139,6 +151,14 @@ class Worker(LocalOrDistributedWorkerBase):
...
@@ -139,6 +151,14 @@ class Worker(LocalOrDistributedWorkerBase):
allocator
=
CuMemAllocator
.
get_instance
()
allocator
=
CuMemAllocator
.
get_instance
()
allocator
.
wake_up
(
tags
=
tags
)
allocator
.
wake_up
(
tags
=
tags
)
# Restore the buffers after level 2 sleep
if
len
(
self
.
_sleep_saved_buffers
):
model
=
self
.
model_runner
.
model
for
name
,
buffer
in
model
.
named_buffers
():
if
name
in
self
.
_sleep_saved_buffers
:
buffer
.
data
.
copy_
(
self
.
_sleep_saved_buffers
[
name
].
data
)
self
.
_sleep_saved_buffers
=
{}
def
init_device
(
self
)
->
None
:
def
init_device
(
self
)
->
None
:
if
self
.
device_config
.
device
.
type
==
"cuda"
:
if
self
.
device_config
.
device
.
type
==
"cuda"
:
# torch.distributed.all_reduce does not free the input tensor until
# torch.distributed.all_reduce does not free the input tensor until
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment