norm / vllm

Commit aedba6d5 (unverified)
Authored May 23, 2023 by Woosuk Kwon; committed by GitHub, May 23, 2023
Parent: a283ec2e

Print warnings/errors for large swap space (#123)
Showing 3 changed files with 34 additions and 0 deletions (+34 -0)
cacheflow/config.py             +31 -0
cacheflow/server/llm_server.py   +1 -0
cacheflow/utils.py               +2 -0
cacheflow/config.py

@@ -3,6 +3,11 @@ from typing import Optional

import torch
from transformers import AutoConfig, PretrainedConfig

from cacheflow.logger import init_logger
from cacheflow.utils import get_cpu_memory

logger = init_logger(__name__)

_GiB = 1 << 30

@@ -73,11 +78,37 @@ class CacheConfig:

        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
        self.swap_space_bytes = swap_space * _GiB
        self._verify_args()

        # Will be set after profiling.
        self.num_gpu_blocks = None
        self.num_cpu_blocks = None

    def _verify_args(self) -> None:
        if self.gpu_memory_utilization > 1.0:
            raise ValueError(
                "GPU memory utilization must be less than 1.0. Got "
                f"{self.gpu_memory_utilization}.")

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        total_cpu_memory = get_cpu_memory()
        # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
        # group are in the same node. However, the GPUs may span multiple nodes.
        num_gpus_per_node = parallel_config.tensor_parallel_size
        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node

        msg = (f"{cpu_memory_usage / _GiB:.2f} GiB out of "
               f"the {total_cpu_memory / _GiB:.2f} GiB total CPU memory is "
               "allocated for the swap space.")
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        elif cpu_memory_usage > 0.4 * total_cpu_memory:
            logger.warn("Possibly too large swap space. " + msg)


class ParallelConfig:
    ...
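The new check treats the swap space as a per-GPU allocation: a tensor-parallel group of N GPUs on one node reserves N times the configured swap space in CPU memory, and the commit raises an error above 70% of total CPU memory and warns above 40%. A minimal standalone sketch of that threshold logic follows; the function name check_swap_space and the example numbers are illustrative and not part of the commit, which keeps this logic inside CacheConfig.verify_with_parallel_config.

# Hypothetical, self-contained sketch of the swap-space check added here.
_GiB = 1 << 30


def check_swap_space(swap_space_gib: int, tensor_parallel_size: int,
                     total_cpu_memory: int) -> None:
    # Swap space is allocated once per GPU, so an N-way tensor-parallel
    # group on one node needs N * swap_space bytes of CPU memory.
    cpu_memory_usage = swap_space_gib * _GiB * tensor_parallel_size
    msg = (f"{cpu_memory_usage / _GiB:.2f} GiB out of "
           f"the {total_cpu_memory / _GiB:.2f} GiB total CPU memory is "
           "allocated for the swap space.")
    if cpu_memory_usage > 0.7 * total_cpu_memory:
        raise ValueError("Too large swap space. " + msg)
    elif cpu_memory_usage > 0.4 * total_cpu_memory:
        print("WARNING: Possibly too large swap space. " + msg)


# Example: 8 GPUs x 20 GiB swap on a 256 GiB node -> 160 GiB (62.5%),
# which is above the 40% threshold but below 70%, so it only warns.
check_swap_space(swap_space_gib=20, tensor_parallel_size=8,
                 total_cpu_memory=256 * _GiB)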
cacheflow/server/llm_server.py

@@ -84,6 +84,7 @@ class LLMServer:

    def _verify_args(self) -> None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
        self.cache_config.verify_with_parallel_config(self.parallel_config)

    def _init_cache(self) -> None:
        # Get the maximum number of blocks that can be allocated on GPU and CPU.
        ...
cacheflow/utils.py

@@ -24,8 +24,10 @@ class Counter:

def get_gpu_memory(gpu: int = 0) -> int:
    """Returns the total memory of the GPU in bytes."""
    return torch.cuda.get_device_properties(gpu).total_memory


def get_cpu_memory() -> int:
    """Returns the total CPU memory of the node in bytes."""
    return psutil.virtual_memory().total
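Both helpers wrap standard APIs (psutil.virtual_memory and torch.cuda.get_device_properties). A quick, machine-dependent sketch of what they report, written outside cacheflow for illustration:

# Illustrative only; the printed values depend on the machine.
import psutil
import torch

print(f"CPU memory: {psutil.virtual_memory().total / (1 << 30):.2f} GiB")
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU 0 memory: {props.total_memory / (1 << 30):.2f} GiB")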