vllm · Commit 30bad5c4 (unverified)

Authored Dec 12, 2023 by Woosuk Kwon; committed by GitHub on Dec 12, 2023.

Fix peak memory profiling (#2031)
Parent: 3fefe271

Showing 2 changed files with 3 additions and 9 deletions:

  vllm/utils.py            +0 / -5
  vllm/worker/worker.py    +3 / -4
vllm/utils.py (view file @ 30bad5c4)

@@ -40,11 +40,6 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     return int(max_shared_mem)
 
 
-def get_gpu_memory(gpu: int = 0) -> int:
-    """Returns the total memory of the GPU in bytes."""
-    return torch.cuda.get_device_properties(gpu).total_memory
-
-
 def get_cpu_memory() -> int:
     """Returns the total CPU memory of the node in bytes."""
     return psutil.virtual_memory().total
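The function removed here was a thin wrapper around torch.cuda.get_device_properties(gpu).total_memory. The worker change below switches to torch.cuda.mem_get_info(), which returns the total device memory alongside the free memory, so the wrapper presumably has no remaining callers. A quick illustrative comparison (not part of the commit; device index 0 is assumed):

import torch

# Removed helper's approach: total GPU memory via the device properties.
total_via_props = torch.cuda.get_device_properties(0).total_memory

# New path used by the worker: mem_get_info() returns (free, total) bytes
# for the current device, so the total comes along with the free memory.
free_bytes, total_bytes = torch.cuda.mem_get_info()

# The two totals generally agree on a whole, un-partitioned GPU.
print(total_via_props, total_bytes)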
vllm/worker/worker.py (view file @ 30bad5c4)

@@ -13,7 +13,6 @@ from vllm.model_executor.parallel_utils.parallel_state import (
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.model_runner import ModelRunner
-from vllm.utils import get_gpu_memory
 
 
 class Worker:
@@ -81,7 +80,6 @@ class Worker:
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
         torch.cuda.empty_cache()
-        torch.cuda.reset_peak_memory_stats()
 
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
@@ -90,8 +88,9 @@ class Worker:
         # Calculate the number of blocks that can be allocated with the
         # profiled peak memory.
         torch.cuda.synchronize()
-        peak_memory = torch.cuda.max_memory_allocated()
-        total_gpu_memory = get_gpu_memory()
+        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+        peak_memory = total_gpu_memory - free_gpu_memory
+
         cache_block_size = CacheEngine.get_cache_block_size(
             block_size, self.model_config, self.parallel_config)
         num_gpu_blocks = int(
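The substance of the fix is replacing torch.cuda.max_memory_allocated() with a free-versus-total measurement from torch.cuda.mem_get_info(). max_memory_allocated() only counts memory obtained through PyTorch's caching allocator, so allocations made outside it (non-torch libraries, NCCL buffers, and similar) are missed, while the difference between total and free device memory reflects everything resident on the GPU. A minimal sketch of the new measurement (the standalone function name is illustrative, not the actual Worker method):

import torch

def profile_peak_gpu_memory() -> int:
    """Sketch: measure peak GPU usage the way this commit does."""
    torch.cuda.empty_cache()
    # ... the Worker runs a forward pass with dummy inputs here ...
    torch.cuda.synchronize()
    # (free, total) in bytes for the current device; the difference covers
    # all GPU memory in use, not just PyTorch-managed allocations.
    free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
    return total_gpu_memory - free_gpu_memory

A consequence of measuring at the device level is that the profile assumes no other process is allocating GPU memory at the same time; any external usage would be attributed to the model and shrink the computed number of cache blocks.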