Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc867be1
Unverified
Commit
cc867be1
authored
Jun 09, 2025
by
Ye (Charlotte) Qi
Committed by
GitHub
Jun 10, 2025
Browse files
[V1] Reuse V0's memory_profiling util for gpu worker memory profiling (#19312)
Signed-off-by:
Ye (Charlotte) Qi
<
yeq@meta.com
>
parent
3a7cd627
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
51 additions
and
53 deletions
+51
-53
vllm/utils.py
vllm/utils.py
+16
-2
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+35
-51
No files found.
vllm/utils.py
View file @
cc867be1
...
@@ -2269,6 +2269,8 @@ def kill_process_tree(pid: int):
...
@@ -2269,6 +2269,8 @@ def kill_process_tree(pid: int):
class
MemorySnapshot
:
class
MemorySnapshot
:
"""Memory snapshot."""
"""Memory snapshot."""
torch_peak
:
int
=
0
torch_peak
:
int
=
0
free_memory
:
int
=
0
total_memory
:
int
=
0
cuda_memory
:
int
=
0
cuda_memory
:
int
=
0
torch_memory
:
int
=
0
torch_memory
:
int
=
0
non_torch_memory
:
int
=
0
non_torch_memory
:
int
=
0
...
@@ -2288,8 +2290,8 @@ class MemorySnapshot:
...
@@ -2288,8 +2290,8 @@ class MemorySnapshot:
self
.
torch_peak
=
torch
.
cuda
.
memory_stats
().
get
(
self
.
torch_peak
=
torch
.
cuda
.
memory_stats
().
get
(
"allocated_bytes.all.peak"
,
0
)
"allocated_bytes.all.peak"
,
0
)
self
.
cuda
_memory
=
torch
.
cuda
.
mem_get_info
(
self
.
free_memory
,
self
.
total
_memory
=
torch
.
cuda
.
mem_get_info
(
)
)[
1
]
-
torch
.
cuda
.
mem
_get_info
()[
0
]
self
.
cuda
_
mem
ory
=
self
.
total_memory
-
self
.
free_memory
# torch.cuda.memory_reserved() is how many bytes
# torch.cuda.memory_reserved() is how many bytes
# PyTorch gets from cuda (by calling cudaMalloc, etc.)
# PyTorch gets from cuda (by calling cudaMalloc, etc.)
...
@@ -2302,6 +2304,8 @@ class MemorySnapshot:
...
@@ -2302,6 +2304,8 @@ class MemorySnapshot:
def
__sub__
(
self
,
other
:
MemorySnapshot
)
->
MemorySnapshot
:
def
__sub__
(
self
,
other
:
MemorySnapshot
)
->
MemorySnapshot
:
return
MemorySnapshot
(
return
MemorySnapshot
(
torch_peak
=
self
.
torch_peak
-
other
.
torch_peak
,
torch_peak
=
self
.
torch_peak
-
other
.
torch_peak
,
free_memory
=
self
.
free_memory
-
other
.
free_memory
,
total_memory
=
self
.
total_memory
-
other
.
total_memory
,
cuda_memory
=
self
.
cuda_memory
-
other
.
cuda_memory
,
cuda_memory
=
self
.
cuda_memory
-
other
.
cuda_memory
,
torch_memory
=
self
.
torch_memory
-
other
.
torch_memory
,
torch_memory
=
self
.
torch_memory
-
other
.
torch_memory
,
non_torch_memory
=
self
.
non_torch_memory
-
other
.
non_torch_memory
,
non_torch_memory
=
self
.
non_torch_memory
-
other
.
non_torch_memory
,
...
@@ -2323,6 +2327,16 @@ class MemoryProfilingResult:
...
@@ -2323,6 +2327,16 @@ class MemoryProfilingResult:
after_profile
:
MemorySnapshot
=
field
(
default_factory
=
MemorySnapshot
)
after_profile
:
MemorySnapshot
=
field
(
default_factory
=
MemorySnapshot
)
profile_time
:
float
=
0.0
profile_time
:
float
=
0.0
def
__repr__
(
self
)
->
str
:
return
(
f
"Memory profiling takes
{
self
.
profile_time
:.
2
f
}
seconds. "
f
"Total non KV cache memory: "
f
"
{
(
self
.
non_kv_cache_memory
/
GiB_bytes
):.
2
f
}
GiB; "
f
"torch peak memory increase: "
f
"
{
(
self
.
torch_peak_increase
/
GiB_bytes
):.
2
f
}
GiB; "
f
"non-torch forward increase memory: "
f
"
{
(
self
.
non_torch_increase
/
GiB_bytes
):.
2
f
}
GiB; "
f
"weights memory:
{
(
self
.
weights_memory
/
GiB_bytes
):.
2
f
}
GiB."
)
@
contextlib
.
contextmanager
@
contextlib
.
contextmanager
def
memory_profiling
(
def
memory_profiling
(
...
...
vllm/v1/worker/gpu_worker.py
View file @
cc867be1
...
@@ -22,7 +22,7 @@ from vllm.lora.request import LoRARequest
...
@@ -22,7 +22,7 @@ from vllm.lora.request import LoRARequest
from
vllm.model_executor
import
set_random_seed
from
vllm.model_executor
import
set_random_seed
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
GiB_bytes
from
vllm.utils
import
GiB_bytes
,
MemorySnapshot
,
memory_profiling
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
,
KVCacheSpec
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
,
KVCacheSpec
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.utils
import
report_usage_stats
from
vllm.v1.utils
import
report_usage_stats
...
@@ -130,20 +130,22 @@ class Worker(WorkerBase):
...
@@ -130,20 +130,22 @@ class Worker(WorkerBase):
_check_if_gpu_supports_dtype
(
self
.
model_config
.
dtype
)
_check_if_gpu_supports_dtype
(
self
.
model_config
.
dtype
)
gc
.
collect
()
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
torch
.
cuda
.
empty_cache
()
self
.
init_gpu_memory
,
total_gpu_memory
=
torch
.
cuda
.
mem_get_info
()
requested_memory
=
(
total_gpu_memory
*
# take current memory snapshot
self
.
cache_config
.
gpu_memory_utilization
)
self
.
init_snapshot
=
MemorySnapshot
()
if
self
.
init_gpu_memory
<
requested_memory
:
self
.
requested_memory
=
(
self
.
init_snapshot
.
total_memory
*
self
.
cache_config
.
gpu_memory_utilization
)
if
self
.
init_snapshot
.
free_memory
<
self
.
requested_memory
:
GiB
=
lambda
b
:
round
(
b
/
GiB_bytes
,
2
)
GiB
=
lambda
b
:
round
(
b
/
GiB_bytes
,
2
)
raise
ValueError
(
raise
ValueError
(
f
"Free memory on device (
{
GiB
(
self
.
init_gpu_memory
)
}
/"
f
"Free memory on device "
f
"
{
GiB
(
total_gpu_memory
)
}
GiB) on startup is less than "
f
"(
{
GiB
(
self
.
init_snapshot
.
free_memory
)
}
/"
f
"desired GPU memory utilization "
f
"
{
GiB
(
self
.
init_snapshot
.
total_memory
)
}
GiB) on startup "
f
"is less than desired GPU memory utilization "
f
"(
{
self
.
cache_config
.
gpu_memory_utilization
}
, "
f
"(
{
self
.
cache_config
.
gpu_memory_utilization
}
, "
f
"
{
GiB
(
requested_memory
)
}
GiB). Decrease GPU memory "
f
"
{
GiB
(
self
.
requested_memory
)
}
GiB). Decrease GPU memory "
f
"utilization or reduce GPU memory used by other processes."
f
"utilization or reduce GPU memory used by other processes."
)
)
else
:
else
:
raise
RuntimeError
(
raise
RuntimeError
(
f
"Not support device type:
{
self
.
device_config
.
device
}
"
)
f
"Not support device type:
{
self
.
device_config
.
device
}
"
)
...
@@ -192,57 +194,39 @@ class Worker(WorkerBase):
...
@@ -192,57 +194,39 @@ class Worker(WorkerBase):
"""
"""
torch
.
cuda
.
empty_cache
()
torch
.
cuda
.
empty_cache
()
torch
.
cuda
.
reset_peak_memory_stats
()
torch
.
cuda
.
reset_peak_memory_stats
()
GiB
=
lambda
b
:
b
/
GiB_bytes
_
,
total_gpu_memory
=
torch
.
cuda
.
mem_get_info
()
# Execute a forward pass with dummy inputs to profile the memory usage
# Execute a forward pass with dummy inputs to profile the memory usage
# of the model.
# of the model.
self
.
model_runner
.
profile_run
()
with
memory_profiling
(
self
.
init_snapshot
,
weights_memory
=
int
(
self
.
model_runner
.
model_memory_usage
))
as
profile_result
:
self
.
model_runner
.
profile_run
()
free_gpu_memory
,
_
=
torch
.
cuda
.
mem_get_info
()
free_gpu_memory
=
profile_result
.
after_profile
.
free_memory
# NOTE(woosuk): Here we assume that the other processes using the same
# NOTE(woosuk): Here we assume that the other processes using the same
# GPU did not change their memory usage during the profiling.
# GPU did not change their memory usage during the profiling.
assert
self
.
init_
gpu
_memory
>
free_gpu_memory
,
(
assert
self
.
init_
snapshot
.
free
_memory
>
free_gpu_memory
,
(
"Error in memory profiling. "
"Error in memory profiling. "
f
"Initial free memory
{
self
.
init_gpu_memory
/
GiB_bytes
}
GiB, "
f
"Initial free memory
{
GiB
(
self
.
init_snapshot
.
free_memory
)
}
GiB, "
f
"current free memory
{
free_gpu_memory
/
GiB_bytes
}
GiB. "
f
"current free memory
{
GiB
(
free_gpu_memory
)
}
GiB. "
f
"This happens when the GPU memory was not properly cleaned up "
"This happens when other processes sharing the same container "
f
"before initializing the vLLM instance."
)
"release GPU memory while vLLM is profiling during initialization. "
"To fix this, ensure consistent GPU memory allocation or "
# Get the peak memory allocation recorded by torch
"isolate vLLM in its own container."
)
peak_torch_memory
=
torch
.
cuda
.
memory_stats
(
available_kv_cache_memory
=
self
.
requested_memory
\
)[
"allocated_bytes.all.peak"
]
-
profile_result
.
non_kv_cache_memory
# Check for any memory left around that may have been allocated on the
# gpu outside of `torch`. NCCL operations, for example, can use a few
# GB during a forward pass.
torch
.
cuda
.
empty_cache
()
torch_allocated_bytes
=
torch
.
cuda
.
memory_stats
(
)[
"allocated_bytes.all.current"
]
# Reset after emptying torch cache
free_gpu_memory
=
torch
.
cuda
.
mem_get_info
()[
0
]
# Total forward allocation (current) is equal to the diff in free memory
fwd_alloc_bytes
=
self
.
init_gpu_memory
-
free_gpu_memory
# We assume current non-torch allocation is equal to peak
non_torch_alloc_bytes
=
max
(
0
,
fwd_alloc_bytes
-
torch_allocated_bytes
)
# Total forward allocation (peak) is peak torch + non-torch
peak_memory
=
peak_torch_memory
+
non_torch_alloc_bytes
available_kv_cache_memory
=
(
total_gpu_memory
*
self
.
cache_config
.
gpu_memory_utilization
-
peak_memory
)
GiB
=
lambda
b
:
b
/
GiB_bytes
logger
.
debug
(
logger
.
debug
(
"Initial free memory: %.2f GiB, free memory: %.2f GiB, "
"Initial free memory: %.2f GiB, free memory: %.2f GiB, "
"
total
GPU memory: %.2f GiB"
,
GiB
(
self
.
init_gpu_memory
),
"
requested
GPU memory: %.2f GiB"
,
GiB
(
free
_gpu
_memory
),
GiB
(
total
_gpu_memory
)
)
GiB
(
self
.
init_snapshot
.
free_memory
),
GiB
(
free
_gpu_memory
)
,
logger
.
debug
(
GiB
(
self
.
requested_memory
))
"Peak torch memory: %.2f GiB, non-torch forward-pass memory: "
logger
.
debug
(
profile_result
)
"%.2f GiB, a
vailable KV
C
ache memory: %.2f GiB"
,
logger
.
info
(
"A
vailable KV
c
ache memory: %.2f GiB"
,
GiB
(
peak_tor
ch_memory
)
,
GiB
(
non_torch_alloc_bytes
),
GiB
(
available_kv_ca
ch
e
_memory
)
)
GiB
(
available_kv_cache_memory
)
)
gc
.
collect
(
)
return
int
(
available_kv_cache_memory
)
return
int
(
available_kv_cache_memory
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment