norm / vllm

Unverified commit 9f659bf0
Authored Jan 15, 2024 by Roy, committed by GitHub Jan 14, 2024

[Minor] Optimize cuda graph memory usage (#2437)
parent 35c4bc20

Showing 1 changed file with 10 additions and 2 deletions

vllm/worker/model_runner.py  (+10, -2)
vllm/worker/model_runner.py  (view file @ 9f659bf0)

@@ -506,7 +506,9 @@ class ModelRunner:
                     "use '--enforce-eager' in the CLI.")
         logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. "
                     "If you are running out of memory, consider decreasing "
-                    "`gpu_memory_utilization` or enforcing eager mode.")
+                    "`gpu_memory_utilization` or enforcing eager mode. "
+                    "You can also reduce the `max_num_seqs` as needed "
+                    "to decrease memory usage.")
         start_time = time.perf_counter()
 
         # Prepare dummy inputs. These will be reused for all batch sizes.
@@ -519,9 +521,15 @@ class ModelRunner:
         context_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda()
         block_tables = torch.from_numpy(self.graph_block_tables).cuda()
 
+        graph_batch_size = _get_graph_batch_size(
+            self.scheduler_config.max_num_seqs)
+        batch_size_capture_list = [
+            bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size
+        ]
+
         # NOTE: Capturing the largest batch size first may help reduce the
         # memory usage of CUDA graph.
-        for batch_size in reversed(_BATCH_SIZES_TO_CAPTURE):
+        for batch_size in reversed(batch_size_capture_list):
             # Create dummy input_metadata.
             input_metadata = InputMetadata(
                 is_prompt=False,
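The substance of the patch is the second hunk: instead of capturing a CUDA graph for every entry in _BATCH_SIZES_TO_CAPTURE, capture now stops at the padded value of scheduler_config.max_num_seqs, so deployments that cap concurrency hold fewer graphs in GPU memory. Below is a minimal standalone sketch of that filtering; the batch-size table and the padding rule in _get_graph_batch_size are assumptions made for illustration, not values copied from the vLLM source.

# Standalone sketch of the capture-list filtering introduced by this commit.
# ASSUMED values: the real _BATCH_SIZES_TO_CAPTURE and _get_graph_batch_size
# live in vllm/worker/model_runner.py and may differ.
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]  # assumed: 1 .. 256

def _get_graph_batch_size(batch_size: int) -> int:
    # Assumed padding rule: round the requested size up to a captured size.
    if batch_size <= 2:
        return batch_size
    if batch_size <= 4:
        return 4
    return (batch_size + 7) // 8 * 8

def batch_size_capture_list(max_num_seqs: int) -> list[int]:
    # Mirrors the new logic: only capture sizes the scheduler can actually produce.
    graph_batch_size = _get_graph_batch_size(max_num_seqs)
    return [bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size]

if __name__ == "__main__":
    # Before the change, all 35 assumed sizes were captured regardless of max_num_seqs.
    print(len(_BATCH_SIZES_TO_CAPTURE))        # 35
    # With max_num_seqs=64, only 11 graphs are captured, saving the memory that
    # the larger, never-used batch sizes would otherwise consume.
    print(batch_size_capture_list(64))         # [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64]
    print(len(batch_size_capture_list(64)))    # 11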
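The updated log message also names the run-time knobs for avoiding out-of-memory during capture: lower gpu_memory_utilization, reduce max_num_seqs, or fall back to eager mode. A hedged example of setting them through vLLM's offline LLM API follows; the model name and the concrete values are illustrative assumptions, not recommendations from this commit.

# Illustrative only: the mitigations named in the new log message.
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",       # placeholder model, not part of this commit
    gpu_memory_utilization=0.80,     # leave more headroom than the 0.90 default
    max_num_seqs=64,                 # fewer concurrent sequences, so fewer graphs are captured
    # enforce_eager=True,            # last resort: skip CUDA graph capture entirely
)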