Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6f5e6533
Unverified
Commit
6f5e6533
authored
Jan 07, 2026
by
Ning Xie
Committed by
GitHub
Jan 06, 2026
Browse files
[Log] add log about gpu worker init snapshot and requested memory (#29493)
Signed-off-by:
Andy Xie
<
andy.xning@gmail.com
>
parent
22dffca9
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
59 additions
and
42 deletions
+59
-42
vllm/utils/mem_utils.py
vllm/utils/mem_utils.py
+20
-4
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+30
-30
vllm/v1/worker/utils.py
vllm/v1/worker/utils.py
+9
-8
No files found.
vllm/utils/mem_utils.py
View file @
6f5e6533
...
@@ -14,6 +14,10 @@ import torch.types
...
@@ -14,6 +14,10 @@ import torch.types
from
.mem_constants
import
GiB_bytes
from
.mem_constants
import
GiB_bytes
def format_gib(b: int) -> float:
    """Return the byte count ``b`` expressed in GiB, rounded to two decimals."""
    gib = b / GiB_bytes
    return round(gib, 2)
@
cache
@
cache
def
get_max_shared_memory_bytes
(
gpu
:
int
=
0
)
->
int
:
def
get_max_shared_memory_bytes
(
gpu
:
int
=
0
)
->
int
:
"""Returns the maximum shared memory per thread block in bytes."""
"""Returns the maximum shared memory per thread block in bytes."""
...
@@ -146,6 +150,18 @@ class MemorySnapshot:
...
@@ -146,6 +150,18 @@ class MemorySnapshot:
auto_measure
=
False
,
auto_measure
=
False
,
)
)
# __repr__ for the memory snapshot: returns a single string made of
# comma-separated "name=value" pairs.
# The six memory fields (torch_peak, free_memory, total_memory, cuda_memory,
# torch_memory, non_torch_memory) are each passed through format_gib and
# suffixed with "GiB"; timestamp and auto_measure are interpolated as-is.
# NOTE(review): this listing shows one token per line, so whitespace inside
# the f-string literals may have been lost in extraction -- confirm the exact
# strings against the real file before relying on them.
def
__repr__
(
self
)
->
str
:
return
(
f
"torch_peak=
{
format_gib
(
self
.
torch_peak
)
}
GiB, "
f
"free_memory=
{
format_gib
(
self
.
free_memory
)
}
GiB, "
f
"total_memory=
{
format_gib
(
self
.
total_memory
)
}
GiB, "
f
"cuda_memory=
{
format_gib
(
self
.
cuda_memory
)
}
GiB, "
f
"torch_memory=
{
format_gib
(
self
.
torch_memory
)
}
GiB, "
f
"non_torch_memory=
{
format_gib
(
self
.
non_torch_memory
)
}
GiB, "
f
"timestamp=
{
self
.
timestamp
}
, "
f
"auto_measure=
{
self
.
auto_measure
}
"
)
@
dataclass
@
dataclass
class
MemoryProfilingResult
:
class
MemoryProfilingResult
:
...
@@ -168,12 +184,12 @@ class MemoryProfilingResult:
...
@@ -168,12 +184,12 @@ class MemoryProfilingResult:
return
(
return
(
f
"Memory profiling takes
{
self
.
profile_time
:.
2
f
}
seconds. "
f
"Memory profiling takes
{
self
.
profile_time
:.
2
f
}
seconds. "
f
"Total non KV cache memory: "
f
"Total non KV cache memory: "
f
"
{
(
self
.
non_kv_cache_memory
/
GiB_bytes
):.
2
f
}
GiB; "
f
"
{
format_gib
(
self
.
non_kv_cache_memory
)
}
GiB; "
f
"torch peak memory increase: "
f
"torch peak memory increase: "
f
"
{
(
self
.
torch_peak_increase
/
GiB_bytes
):.
2
f
}
GiB; "
f
"
{
format_gib
(
self
.
torch_peak_increase
)
}
GiB; "
f
"non-torch forward increase memory: "
f
"non-torch forward increase memory: "
f
"
{
(
self
.
non_torch_increase
/
GiB_bytes
):.
2
f
}
GiB; "
f
"
{
format_gib
(
self
.
non_torch_increase
)
}
GiB; "
f
"weights memory:
{
(
self
.
weights_memory
/
GiB_bytes
):.
2
f
}
GiB."
f
"weights memory:
{
format_gib
(
self
.
weights_memory
)
}
GiB."
)
)
...
...
vllm/v1/worker/gpu_worker.py
View file @
6f5e6533
...
@@ -40,8 +40,7 @@ from vllm.platforms import current_platform
...
@@ -40,8 +40,7 @@ from vllm.platforms import current_platform
from
vllm.profiler.wrapper
import
CudaProfilerWrapper
,
TorchProfilerWrapper
from
vllm.profiler.wrapper
import
CudaProfilerWrapper
,
TorchProfilerWrapper
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.tasks
import
SupportedTask
from
vllm.tasks
import
SupportedTask
from
vllm.utils.mem_constants
import
GiB_bytes
from
vllm.utils.mem_utils
import
MemorySnapshot
,
format_gib
,
memory_profiling
from
vllm.utils.mem_utils
import
MemorySnapshot
,
memory_profiling
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.v1.core.sched.output
import
GrammarOutput
,
SchedulerOutput
from
vllm.v1.core.sched.output
import
GrammarOutput
,
SchedulerOutput
from
vllm.v1.engine
import
ReconfigureDistributedRequest
,
ReconfigureRankType
from
vllm.v1.engine
import
ReconfigureDistributedRequest
,
ReconfigureRankType
...
@@ -132,9 +131,9 @@ class Worker(WorkerBase):
...
@@ -132,9 +131,9 @@ class Worker(WorkerBase):
used_bytes
=
total
-
free_bytes_after_sleep
used_bytes
=
total
-
free_bytes_after_sleep
assert
freed_bytes
>=
0
,
"Memory usage increased after sleeping."
assert
freed_bytes
>=
0
,
"Memory usage increased after sleeping."
logger
.
info
(
logger
.
info
(
"Sleep mode freed %
.2
f GiB memory, %
.2
f GiB memory is still in use."
,
"Sleep mode freed %f GiB memory, %f GiB memory is still in use."
,
freed_bytes
/
GiB_bytes
,
format_gib
(
freed_bytes
)
,
used_bytes
/
GiB_bytes
,
format_gib
(
used_bytes
)
,
)
)
def
wake_up
(
self
,
tags
:
list
[
str
]
|
None
=
None
)
->
None
:
def
wake_up
(
self
,
tags
:
list
[
str
]
|
None
=
None
)
->
None
:
...
@@ -239,6 +238,10 @@ class Worker(WorkerBase):
...
@@ -239,6 +238,10 @@ class Worker(WorkerBase):
# take current memory snapshot
# take current memory snapshot
self
.
init_snapshot
=
init_snapshot
=
MemorySnapshot
(
device
=
self
.
device
)
self
.
init_snapshot
=
init_snapshot
=
MemorySnapshot
(
device
=
self
.
device
)
self
.
requested_memory
=
request_memory
(
init_snapshot
,
self
.
cache_config
)
self
.
requested_memory
=
request_memory
(
init_snapshot
,
self
.
cache_config
)
logger
.
debug
(
"worker init memory snapshot: %r"
,
self
.
init_snapshot
)
logger
.
debug
(
"worker requested memory: %sGiB"
,
format_gib
(
self
.
requested_memory
)
)
else
:
else
:
raise
RuntimeError
(
f
"Not support device type:
{
self
.
device_config
.
device
}
"
)
raise
RuntimeError
(
f
"Not support device type:
{
self
.
device_config
.
device
}
"
)
...
@@ -293,15 +296,14 @@ class Worker(WorkerBase):
...
@@ -293,15 +296,14 @@ class Worker(WorkerBase):
You may limit the usage of GPU memory
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
by adjusting the `gpu_memory_utilization` parameter.
"""
"""
GiB
=
lambda
b
:
b
/
GiB_bytes
if
kv_cache_memory_bytes
:
=
self
.
cache_config
.
kv_cache_memory_bytes
:
if
kv_cache_memory_bytes
:
=
self
.
cache_config
.
kv_cache_memory_bytes
:
# still need a profile run which compiles the model for
# still need a profile run which compiles the model for
# max_num_batched_tokens
# max_num_batched_tokens
self
.
model_runner
.
profile_run
()
self
.
model_runner
.
profile_run
()
msg
=
(
msg
=
(
f
"Initial free memory
{
GiB
(
self
.
init_snapshot
.
free_memory
)
:.
2
f
}
"
f
"Initial free memory
{
format_gib
(
self
.
init_snapshot
.
free_memory
)
}
"
f
"GiB, reserved
{
GiB
(
kv_cache_memory_bytes
)
:.
2
f
}
GiB memory for "
f
"GiB, reserved
{
format_gib
(
kv_cache_memory_bytes
)
}
GiB memory for "
"KV Cache as specified by kv_cache_memory_bytes config and "
"KV Cache as specified by kv_cache_memory_bytes config and "
"skipped memory profiling. This does not respect the "
"skipped memory profiling. This does not respect the "
"gpu_memory_utilization config. Only use kv_cache_memory_bytes "
"gpu_memory_utilization config. Only use kv_cache_memory_bytes "
...
@@ -333,8 +335,8 @@ class Worker(WorkerBase):
...
@@ -333,8 +335,8 @@ class Worker(WorkerBase):
# GPU did not change their memory usage during the profiling.
# GPU did not change their memory usage during the profiling.
assert
self
.
init_snapshot
.
free_memory
>
free_gpu_memory
,
(
assert
self
.
init_snapshot
.
free_memory
>
free_gpu_memory
,
(
"Error in memory profiling. "
"Error in memory profiling. "
f
"Initial free memory
{
GiB
(
self
.
init_snapshot
.
free_memory
)
}
GiB, "
f
"Initial free memory
{
format_gib
(
self
.
init_snapshot
.
free_memory
)
}
GiB, "
f
"current free memory
{
GiB
(
free_gpu_memory
)
}
GiB. "
f
"current free memory
{
format_gib
(
free_gpu_memory
)
}
GiB. "
"This happens when other processes sharing the same container "
"This happens when other processes sharing the same container "
"release GPU memory while vLLM is profiling during initialization. "
"release GPU memory while vLLM is profiling during initialization. "
"To fix this, ensure consistent GPU memory allocation or "
"To fix this, ensure consistent GPU memory allocation or "
...
@@ -346,21 +348,20 @@ class Worker(WorkerBase):
...
@@ -346,21 +348,20 @@ class Worker(WorkerBase):
unrequested_memory
=
self
.
init_snapshot
.
free_memory
-
self
.
requested_memory
unrequested_memory
=
self
.
init_snapshot
.
free_memory
-
self
.
requested_memory
logger
.
debug
(
logger
.
debug
(
"Initial free memory: %
.2
f GiB; Requested memory: %
.2
f (util), %
.2
f GiB"
,
"Initial free memory: %f GiB; Requested memory: %f (util), %f GiB"
,
GiB
(
self
.
init_snapshot
.
free_memory
),
format_gib
(
self
.
init_snapshot
.
free_memory
),
self
.
cache_config
.
gpu_memory_utilization
,
self
.
cache_config
.
gpu_memory_utilization
,
GiB
(
self
.
requested_memory
),
format_gib
(
self
.
requested_memory
),
)
)
logger
.
debug
(
logger
.
debug
(
"Free memory after profiling: %.2f GiB (total), "
"Free memory after profiling: %f GiB (total), %f GiB (within requested)"
,
"%.2f GiB (within requested)"
,
format_gib
(
free_gpu_memory
),
GiB
(
free_gpu_memory
),
format_gib
(
free_gpu_memory
-
unrequested_memory
),
GiB
(
free_gpu_memory
-
unrequested_memory
),
)
)
logger
.
debug
(
profile_result
)
logger
.
debug
(
profile_result
)
logger
.
info_once
(
logger
.
info_once
(
"Available KV cache memory: %
.2
f GiB"
,
"Available KV cache memory: %f GiB"
,
GiB
(
self
.
available_kv_cache_memory_bytes
),
format_gib
(
self
.
available_kv_cache_memory_bytes
),
scope
=
"local"
,
scope
=
"local"
,
)
)
gc
.
collect
()
gc
.
collect
()
...
@@ -467,7 +468,6 @@ class Worker(WorkerBase):
...
@@ -467,7 +468,6 @@ class Worker(WorkerBase):
# CUDAGraph memory size and may not utilize all gpu memory.
# CUDAGraph memory size and may not utilize all gpu memory.
# Users may want fine-grained control to specify kv cache
# Users may want fine-grained control to specify kv cache
# memory size.
# memory size.
GiB
=
lambda
b
:
round
(
b
/
GiB_bytes
,
2
)
# empirically observed that the memory profiling may
# empirically observed that the memory profiling may
# slightly underestimate the memory consumption.
# slightly underestimate the memory consumption.
...
@@ -492,24 +492,24 @@ class Worker(WorkerBase):
...
@@ -492,24 +492,24 @@ class Worker(WorkerBase):
msg
=
(
msg
=
(
f
"Free memory on device "
f
"Free memory on device "
f
"(
{
GiB
(
self
.
init_snapshot
.
free_memory
)
}
/"
f
"(
{
format_gib
(
self
.
init_snapshot
.
free_memory
)
}
/"
f
"
{
GiB
(
self
.
init_snapshot
.
total_memory
)
}
GiB) on startup. "
f
"
{
format_gib
(
self
.
init_snapshot
.
total_memory
)
}
GiB) on startup. "
f
"Desired GPU memory utilization is "
f
"Desired GPU memory utilization is "
f
"(
{
self
.
cache_config
.
gpu_memory_utilization
}
, "
f
"(
{
self
.
cache_config
.
gpu_memory_utilization
}
, "
f
"
{
GiB
(
self
.
requested_memory
)
}
GiB). "
f
"
{
format_gib
(
self
.
requested_memory
)
}
GiB). "
f
"Actual usage is
{
GiB
(
self
.
model_runner
.
model_memory_usage
)
}
"
f
"Actual usage is
{
format_gib
(
self
.
model_runner
.
model_memory_usage
)
}
"
f
"GiB for weight,
{
GiB
(
self
.
peak_activation_memory
)
}
GiB "
f
"GiB for weight,
{
format_gib
(
self
.
peak_activation_memory
)
}
GiB "
f
"for peak activation,
{
GiB
(
self
.
non_torch_memory
)
}
GiB "
f
"for peak activation,
{
format_gib
(
self
.
non_torch_memory
)
}
GiB "
f
"for non-torch memory, and
{
GiB
(
cuda_graph_memory_bytes
)
}
"
f
"for non-torch memory, and
{
format_gib
(
cuda_graph_memory_bytes
)
}
"
f
"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
f
"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
f
"config with `--kv-cache-memory="
f
"config with `--kv-cache-memory="
f
"
{
kv_cache_memory_bytes_to_requested_limit
}
` "
f
"
{
kv_cache_memory_bytes_to_requested_limit
}
` "
f
"(
{
GiB
(
kv_cache_memory_bytes_to_requested_limit
)
}
GiB) to fit "
f
"(
{
format_gib
(
kv_cache_memory_bytes_to_requested_limit
)
}
GiB) to fit "
f
"into requested memory, or `--kv-cache-memory="
f
"into requested memory, or `--kv-cache-memory="
f
"
{
kv_cache_memory_bytes_to_gpu_limit
}
` "
f
"
{
kv_cache_memory_bytes_to_gpu_limit
}
` "
f
"(
{
GiB
(
kv_cache_memory_bytes_to_gpu_limit
)
}
GiB) to fully "
f
"(
{
format_gib
(
kv_cache_memory_bytes_to_gpu_limit
)
}
GiB) to fully "
f
"utilize gpu memory. Current kv cache memory in use is "
f
"utilize gpu memory. Current kv cache memory in use is "
f
"
{
GiB
(
self
.
available_kv_cache_memory_bytes
)
}
GiB."
f
"
{
format_gib
(
self
.
available_kv_cache_memory_bytes
)
}
GiB."
)
)
logger
.
debug
(
msg
)
logger
.
debug
(
msg
)
...
...
vllm/v1/worker/utils.py
View file @
6f5e6533
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
from
collections
import
defaultdict
from
collections
import
defaultdict
from
dataclasses
import
dataclass
,
field
from
dataclasses
import
dataclass
,
field
...
@@ -15,8 +16,7 @@ from vllm.model_executor.models.utils import extract_layer_index
...
@@ -15,8 +16,7 @@ from vllm.model_executor.models.utils import extract_layer_index
from
vllm.multimodal.cache
import
processor_only_cache_from_config
from
vllm.multimodal.cache
import
processor_only_cache_from_config
from
vllm.multimodal.registry
import
MultiModalRegistry
from
vllm.multimodal.registry
import
MultiModalRegistry
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.mem_constants
import
GiB_bytes
from
vllm.utils.mem_utils
import
MemorySnapshot
,
format_gib
from
vllm.utils.mem_utils
import
MemorySnapshot
from
vllm.v1.attention.backends.utils
import
AttentionMetadataBuilder
from
vllm.v1.attention.backends.utils
import
AttentionMetadataBuilder
from
vllm.v1.core.encoder_cache_manager
import
compute_mm_encoder_budget
from
vllm.v1.core.encoder_cache_manager
import
compute_mm_encoder_budget
from
vllm.v1.kv_cache_interface
import
KVCacheGroupSpec
,
KVCacheSpec
from
vllm.v1.kv_cache_interface
import
KVCacheGroupSpec
,
KVCacheSpec
...
@@ -250,22 +250,23 @@ def gather_mm_placeholders(
...
@@ -250,22 +250,23 @@ def gather_mm_placeholders(
return
placeholders
[
is_embed
]
return
placeholders
[
is_embed
]
def
request_memory
(
init_snapshot
:
MemorySnapshot
,
cache_config
:
CacheConfig
)
->
float
:
def
request_memory
(
init_snapshot
:
MemorySnapshot
,
cache_config
:
CacheConfig
)
->
int
:
"""
"""
Calculate the amount of memory required by vLLM, then validate
Calculate the amount of memory required by vLLM, then validate
that the current amount of free memory is sufficient for that.
that the current amount of free memory is sufficient for that.
"""
"""
requested_memory
=
init_snapshot
.
total_memory
*
cache_config
.
gpu_memory_utilization
requested_memory
=
math
.
ceil
(
init_snapshot
.
total_memory
*
cache_config
.
gpu_memory_utilization
)
if
init_snapshot
.
free_memory
<
requested_memory
:
if
init_snapshot
.
free_memory
<
requested_memory
:
GiB
=
lambda
b
:
round
(
b
/
GiB_bytes
,
2
)
raise
ValueError
(
raise
ValueError
(
f
"Free memory on device
{
init_snapshot
.
device_
}
"
f
"Free memory on device
{
init_snapshot
.
device_
}
"
f
"(
{
GiB
(
init_snapshot
.
free_memory
)
}
/"
f
"(
{
format_gib
(
init_snapshot
.
free_memory
)
}
/"
f
"
{
GiB
(
init_snapshot
.
total_memory
)
}
GiB) on startup "
f
"
{
format_gib
(
init_snapshot
.
total_memory
)
}
GiB) on startup "
f
"is less than desired GPU memory utilization "
f
"is less than desired GPU memory utilization "
f
"(
{
cache_config
.
gpu_memory_utilization
}
, "
f
"(
{
cache_config
.
gpu_memory_utilization
}
, "
f
"
{
GiB
(
requested_memory
)
}
GiB). Decrease GPU memory "
f
"
{
format_gib
(
requested_memory
)
}
GiB). Decrease GPU memory "
f
"utilization or reduce GPU memory used by other processes."
f
"utilization or reduce GPU memory used by other processes."
)
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment