Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2d8476e4
Unverified
Commit
2d8476e4
authored
Jun 07, 2025
by
Luka Govedič
Committed by
GitHub
Jun 07, 2025
Browse files
[BugFix][V1] Fix memory profiling bug (#18974)
Signed-off-by:
luka
<
luka@neuralmagic.com
>
parent
88be823d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
53 additions
and
16 deletions
+53
-16
tests/models/test_initialization.py
tests/models/test_initialization.py
+2
-0
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs.py
+8
-5
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+43
-11
No files found.
tests/models/test_initialization.py
View file @
2d8476e4
...
@@ -86,6 +86,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
...
@@ -86,6 +86,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
}
if
model_info
.
speculative_model
else
None
,
}
if
model_info
.
speculative_model
else
None
,
trust_remote_code
=
model_info
.
trust_remote_code
,
trust_remote_code
=
model_info
.
trust_remote_code
,
max_model_len
=
model_info
.
max_model_len
,
max_model_len
=
model_info
.
max_model_len
,
# these tests seem to produce leftover memory
gpu_memory_utilization
=
0.80
,
load_format
=
"dummy"
,
load_format
=
"dummy"
,
hf_overrides
=
hf_overrides
,
hf_overrides
=
hf_overrides
,
)
)
tests/v1/sample/test_logprobs.py
View file @
2d8476e4
...
@@ -42,7 +42,7 @@ def vllm_model(vllm_runner, request) -> Generator[VllmRunner, None, None]:
...
@@ -42,7 +42,7 @@ def vllm_model(vllm_runner, request) -> Generator[VllmRunner, None, None]:
#TODO: enable this once we support it for
#TODO: enable this once we support it for
# prompt logprobs.
# prompt logprobs.
enable_prefix_caching
=
request
.
param
,
enable_prefix_caching
=
request
.
param
,
gpu_memory_utilization
=
0.
5
,
gpu_memory_utilization
=
0.
4
,
# up to 2 alive concurrently
)
as
vllm_model
:
)
as
vllm_model
:
yield
vllm_model
yield
vllm_model
...
@@ -343,10 +343,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
...
@@ -343,10 +343,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
runner
=
VllmRunner
(
"facebook/opt-125m"
,
runner
=
VllmRunner
(
max_logprobs
=
1
,
"facebook/opt-125m"
,
enable_prefix_caching
=
False
,
max_logprobs
=
1
,
max_model_len
=
256
)
enable_prefix_caching
=
False
,
# 2 other llms alive during whole session
gpu_memory_utilization
=
0.15
,
max_model_len
=
256
)
vllm_sampling_params
=
SamplingParams
(
logprobs
=
1
)
vllm_sampling_params
=
SamplingParams
(
logprobs
=
1
)
# should pass
# should pass
runner
.
generate
([
"Hello world"
],
sampling_params
=
vllm_sampling_params
)
runner
.
generate
([
"Hello world"
],
sampling_params
=
vllm_sampling_params
)
...
...
vllm/v1/worker/gpu_worker.py
View file @
2d8476e4
...
@@ -130,7 +130,20 @@ class Worker(WorkerBase):
...
@@ -130,7 +130,20 @@ class Worker(WorkerBase):
_check_if_gpu_supports_dtype
(
self
.
model_config
.
dtype
)
_check_if_gpu_supports_dtype
(
self
.
model_config
.
dtype
)
gc
.
collect
()
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
torch
.
cuda
.
empty_cache
()
self
.
init_gpu_memory
=
torch
.
cuda
.
mem_get_info
()[
0
]
self
.
init_gpu_memory
,
total_gpu_memory
=
torch
.
cuda
.
mem_get_info
()
requested_memory
=
(
total_gpu_memory
*
self
.
cache_config
.
gpu_memory_utilization
)
if
self
.
init_gpu_memory
<
requested_memory
:
GiB
=
lambda
b
:
round
(
b
/
GiB_bytes
,
2
)
raise
ValueError
(
f
"Free memory on device (
{
GiB
(
self
.
init_gpu_memory
)
}
/"
f
"
{
GiB
(
total_gpu_memory
)
}
GiB) on startup is less than "
f
"desired GPU memory utilization "
f
"(
{
self
.
cache_config
.
gpu_memory_utilization
}
, "
f
"
{
GiB
(
requested_memory
)
}
GiB). Decrease GPU memory "
f
"utilization or reduce GPU memory used by other processes."
)
else
:
else
:
raise
RuntimeError
(
raise
RuntimeError
(
f
"Not support device type:
{
self
.
device_config
.
device
}
"
)
f
"Not support device type:
{
self
.
device_config
.
device
}
"
)
...
@@ -190,28 +203,47 @@ class Worker(WorkerBase):
...
@@ -190,28 +203,47 @@ class Worker(WorkerBase):
# GPU did not change their memory usage during the profiling.
# GPU did not change their memory usage during the profiling.
assert
self
.
init_gpu_memory
>
free_gpu_memory
,
(
assert
self
.
init_gpu_memory
>
free_gpu_memory
,
(
"Error in memory profiling. "
"Error in memory profiling. "
f
"Initial free memory
{
self
.
init_gpu_memory
}
, current free memory"
f
"Initial free memory
{
self
.
init_gpu_memory
/
GiB_bytes
}
GiB, "
f
"
{
free_gpu_memory
}
. This happens when the GPU memory was "
f
"current free memory
{
free_gpu_memory
/
GiB_bytes
}
GiB. "
"not properly cleaned up before initializing the vLLM instance."
)
f
"This happens when the GPU memory was not properly cleaned up "
f
"before initializing the vLLM instance."
)
# Get the peak memory allocation recorded by torch
# Get the peak memory allocation recorded by torch
peak_memory
=
torch
.
cuda
.
memory_stats
()[
"allocated_bytes.all.peak"
]
peak_torch_memory
=
torch
.
cuda
.
memory_stats
(
)[
"allocated_bytes.all.peak"
]
# Check for any memory left around that may have been allocated on the
# Check for any memory left around that may have been allocated on the
# gpu outside of `torch`. NCCL operations, for example, can use a few
# gpu outside of `torch`. NCCL operations, for example, can use a few
# GB during a forward pass
# GB during a forward pass
.
torch
.
cuda
.
empty_cache
()
torch
.
cuda
.
empty_cache
()
torch_allocated_bytes
=
torch
.
cuda
.
memory_stats
(
torch_allocated_bytes
=
torch
.
cuda
.
memory_stats
(
)[
"allocated_bytes.all.current"
]
)[
"allocated_bytes.all.current"
]
total_allocated_bytes
=
torch
.
cuda
.
mem_get_info
(
)[
1
]
-
torch
.
cuda
.
mem_get_info
()[
0
]
# Reset after emptying torch cache
non_torch_allocations
=
total_allocated_bytes
-
torch_allocated_bytes
free_gpu_memory
=
torch
.
cuda
.
mem_get_info
()[
0
]
if
non_torch_allocations
>
0
:
peak_memory
+=
non_torch_allocations
# Total forward allocation (current) is equal to the diff in free memory
fwd_alloc_bytes
=
self
.
init_gpu_memory
-
free_gpu_memory
# We assume current non-torch allocation is equal to peak
non_torch_alloc_bytes
=
max
(
0
,
fwd_alloc_bytes
-
torch_allocated_bytes
)
# Total forward allocation (peak) is peak torch + non-torch
peak_memory
=
peak_torch_memory
+
non_torch_alloc_bytes
available_kv_cache_memory
=
(
available_kv_cache_memory
=
(
total_gpu_memory
*
self
.
cache_config
.
gpu_memory_utilization
-
total_gpu_memory
*
self
.
cache_config
.
gpu_memory_utilization
-
peak_memory
)
peak_memory
)
GiB
=
lambda
b
:
b
/
GiB_bytes
logger
.
debug
(
"Initial free memory: %.2f GiB, free memory: %.2f GiB, "
"total GPU memory: %.2f GiB"
,
GiB
(
self
.
init_gpu_memory
),
GiB
(
free_gpu_memory
),
GiB
(
total_gpu_memory
))
logger
.
debug
(
"Peak torch memory: %.2f GiB, non-torch forward-pass memory: "
"%.2f GiB, available KVCache memory: %.2f GiB"
,
GiB
(
peak_torch_memory
),
GiB
(
non_torch_alloc_bytes
),
GiB
(
available_kv_cache_memory
))
return
int
(
available_kv_cache_memory
)
return
int
(
available_kv_cache_memory
)
def
get_kv_cache_spec
(
self
)
->
dict
[
str
,
KVCacheSpec
]:
def
get_kv_cache_spec
(
self
)
->
dict
[
str
,
KVCacheSpec
]:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment