norm / vllm · Commits

Unverified commit 30bad5c4, authored Dec 12, 2023 by Woosuk Kwon, committed by GitHub on Dec 12, 2023

Fix peak memory profiling (#2031)

Parent: 3fefe271

Showing 2 changed files with 3 additions and 9 deletions (+3 / -9):

  vllm/utils.py          +0 / -5
  vllm/worker/worker.py  +3 / -4

vllm/utils.py

@@ -40,11 +40,6 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     return int(max_shared_mem)
 
 
-def get_gpu_memory(gpu: int = 0) -> int:
-    """Returns the total memory of the GPU in bytes."""
-    return torch.cuda.get_device_properties(gpu).total_memory
-
-
 def get_cpu_memory() -> int:
     """Returns the total CPU memory of the node in bytes."""
     return psutil.virtual_memory().total
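
The deleted helper only wrapped torch.cuda.get_device_properties(gpu).total_memory. A minimal sketch of why it becomes redundant, assuming a CUDA device is available: torch.cuda.mem_get_info(), which worker.py switches to below, already reports the device's total memory alongside its free memory, so the total is available without a separate helper.

    import torch

    # Illustration only, not part of the commit: mem_get_info() returns
    # (free_bytes, total_bytes) for the current device.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print(free_bytes, total_bytes)
    # The deleted helper's value; both calls should report the same device total.
    print(torch.cuda.get_device_properties(0).total_memory)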

vllm/worker/worker.py

@@ -13,7 +13,6 @@ from vllm.model_executor.parallel_utils.parallel_state import (
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.model_runner import ModelRunner
-from vllm.utils import get_gpu_memory
 
 
 class Worker:

@@ -81,7 +80,6 @@ class Worker:
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
         torch.cuda.empty_cache()
-        torch.cuda.reset_peak_memory_stats()
 
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
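
torch.cuda.reset_peak_memory_stats() only zeroes the counter that torch.cuda.max_memory_allocated() reads, and the next hunk stops reading that counter, so the reset becomes dead code. Below is a sketch of the allocator-stats pattern being retired, with a dummy matmul standing in for the profiling forward pass (an illustration, not vLLM code); this counter only sees allocations made through PyTorch's caching allocator.

    import torch

    # Zero the peak counter, run the workload, then read the peak.
    torch.cuda.reset_peak_memory_stats()
    _ = torch.randn(1024, 1024, device="cuda") @ torch.randn(1024, 1024, device="cuda")
    torch.cuda.synchronize()
    # Only memory allocated through PyTorch's caching allocator is counted here.
    peak_bytes = torch.cuda.max_memory_allocated()
    print(f"peak allocated via the torch allocator: {peak_bytes} bytes")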

@@ -90,8 +88,9 @@ class Worker:
         # Calculate the number of blocks that can be allocated with the
         # profiled peak memory.
         torch.cuda.synchronize()
-        peak_memory = torch.cuda.max_memory_allocated()
-        total_gpu_memory = get_gpu_memory()
+        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+        peak_memory = total_gpu_memory - free_gpu_memory
+
         cache_block_size = CacheEngine.get_cache_block_size(
             block_size, self.model_config, self.parallel_config)
         num_gpu_blocks = int(
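
With this hunk, peak usage is derived from the CUDA driver's free/total report rather than from the caching allocator's counter, so memory allocated outside PyTorch (NCCL buffers, other libraries) is presumably accounted for as well. A minimal sketch of the resulting measurement, assuming it runs immediately after the dummy profiling pass; the diff truncates at "num_gpu_blocks = int(", so the budget formula, the gpu_memory_utilization value, and the placeholder cache_block_size below are hypothetical.

    import torch

    # total - free covers all device memory currently in use, not just tensors
    # owned by this process's PyTorch allocator.
    torch.cuda.synchronize()
    free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
    peak_memory = total_gpu_memory - free_gpu_memory

    # Hypothetical continuation (the real formula is cut off in the diff):
    # divide the remaining budget by the per-block cache size.
    gpu_memory_utilization = 0.90       # assumed config value
    cache_block_size = 2 * 1024 * 1024  # placeholder; vLLM gets this from CacheEngine.get_cache_block_size
    num_gpu_blocks = int(
        (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size)
    print(num_gpu_blocks)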