Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
norm
vllm
Commits
85de0934
Unverified
Commit
85de0934
authored
Jun 29, 2023
by
Zhuohan Li
Committed by
GitHub
Jun 29, 2023
Browse files
[Fix] Do not pin memory when in WSL (#312)
parent
f7229756
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
17 additions
and
2 deletions
+17
-2
vllm/utils.py
vllm/utils.py
+5
-0
vllm/worker/cache_engine.py
vllm/worker/cache_engine.py
+12
-2
No files found.
vllm/utils.py
View file @
85de0934
import
enum
import
enum
from
platform
import
uname
import
uuid
import
uuid
import
psutil
import
psutil
...
@@ -36,3 +37,7 @@ def get_cpu_memory() -> int:
...
@@ -36,3 +37,7 @@ def get_cpu_memory() -> int:
def random_uuid() -> str:
    """Return a fresh random identifier as a 32-char lowercase hex string.

    Uses a version-4 (random) UUID; ``uuid4().hex`` is already a ``str``,
    so no further conversion is needed.
    """
    return uuid.uuid4().hex
def in_wsl() -> bool:
    """Detect whether we are running under Windows Subsystem for Linux.

    WSL kernels embed "microsoft" in their uname fields (case varies),
    so a case-insensitive containment test on any field identifies WSL.
    Reference: https://github.com/microsoft/WSL/issues/4071
    """
    # "microsoft" has no space, so checking each uname field separately
    # is equivalent to searching the " "-joined string.
    return any("microsoft" in field.lower() for field in uname())
vllm/worker/cache_engine.py
View file @
85de0934
...
@@ -5,6 +5,10 @@ import torch
...
@@ -5,6 +5,10 @@ import torch
from
vllm
import
cache_ops
from
vllm
import
cache_ops
from
vllm.config
import
CacheConfig
,
ModelConfig
,
ParallelConfig
from
vllm.config
import
CacheConfig
,
ModelConfig
,
ParallelConfig
from
vllm.logger
import
init_logger
from
vllm.utils
import
in_wsl
# Module-level logger for this module (init_logger is the project's
# logging factory from vllm.logger — imported earlier in the file).
logger = init_logger(__name__)

# One layer's KV cache: a (key_blocks, value_blocks) pair of tensors.
# NOTE(review): assumes `Tuple` (typing) and `torch` are imported at the
# top of the file, which is outside this diff hunk — confirm.
KVCache = Tuple[torch.Tensor, torch.Tensor]
...
@@ -85,16 +89,22 @@ class CacheEngine:
...
@@ -85,16 +89,22 @@ class CacheEngine:
cpu_cache
:
List
[
KVCache
]
=
[]
cpu_cache
:
List
[
KVCache
]
=
[]
key_block_shape
=
self
.
get_key_block_shape
()
key_block_shape
=
self
.
get_key_block_shape
()
value_block_shape
=
self
.
get_value_block_shape
()
value_block_shape
=
self
.
get_value_block_shape
()
pin_memory
=
not
in_wsl
()
if
not
pin_memory
:
# Pinning memory in WSL is not supported.
# https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
logger
.
warn
(
"Using 'pin_memory=False' as WSL is detected. "
"This may slow down the performance."
)
for
_
in
range
(
self
.
num_layers
):
for
_
in
range
(
self
.
num_layers
):
key_blocks
=
torch
.
empty
(
key_blocks
=
torch
.
empty
(
size
=
(
self
.
num_cpu_blocks
,
*
key_block_shape
),
size
=
(
self
.
num_cpu_blocks
,
*
key_block_shape
),
dtype
=
self
.
dtype
,
dtype
=
self
.
dtype
,
pin_memory
=
True
,
pin_memory
=
pin_memory
,
)
)
value_blocks
=
torch
.
empty
(
value_blocks
=
torch
.
empty
(
size
=
(
self
.
num_cpu_blocks
,
*
value_block_shape
),
size
=
(
self
.
num_cpu_blocks
,
*
value_block_shape
),
dtype
=
self
.
dtype
,
dtype
=
self
.
dtype
,
pin_memory
=
True
,
pin_memory
=
pin_memory
,
)
)
cpu_cache
.
append
((
key_blocks
,
value_blocks
))
cpu_cache
.
append
((
key_blocks
,
value_blocks
))
return
cpu_cache
return
cpu_cache
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment