Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4ddc4743
Unverified
Commit
4ddc4743
authored
Aug 13, 2024
by
Cyrus Leung
Committed by
GitHub
Aug 12, 2024
Browse files
[Core] Consolidate `GB` constant and enable float GB arguments (#7416)
parent
6aa33cb2
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
21 additions
and
21 deletions
+21
-21
vllm/config.py
vllm/config.py
+6
-7
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+3
-3
vllm/executor/cpu_executor.py
vllm/executor/cpu_executor.py
+3
-4
vllm/executor/openvino_executor.py
vllm/executor/openvino_executor.py
+4
-5
vllm/utils.py
vllm/utils.py
+3
-0
vllm/worker/tpu_worker.py
vllm/worker/tpu_worker.py
+2
-2
No files found.
vllm/config.py
View file @
4ddc4743
...
...
@@ -12,7 +12,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.tracing
import
is_otel_installed
from
vllm.transformers_utils.config
import
get_config
,
get_hf_text_config
from
vllm.utils
import
(
STR_NOT_IMPL_ENC_DEC_CUDAGRAPH
,
from
vllm.utils
import
(
STR_NOT_IMPL_ENC_DEC_CUDAGRAPH
,
GiB_bytes
,
cuda_device_count_stateless
,
get_cpu_memory
,
is_cpu
,
is_hip
,
is_neuron
,
is_openvino
,
is_tpu
,
is_xpu
,
print_warning_once
)
...
...
@@ -27,7 +27,6 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
_GB
=
1
<<
30
_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS
=
32768
_PP_SUPPORTED_MODELS
=
[
...
...
@@ -492,7 +491,7 @@ class CacheConfig:
self
,
block_size
:
int
,
gpu_memory_utilization
:
float
,
swap_space
:
in
t
,
swap_space
:
floa
t
,
cache_dtype
:
str
,
num_gpu_blocks_override
:
Optional
[
int
]
=
None
,
sliding_window
:
Optional
[
int
]
=
None
,
...
...
@@ -501,7 +500,7 @@ class CacheConfig:
)
->
None
:
self
.
block_size
=
block_size
self
.
gpu_memory_utilization
=
gpu_memory_utilization
self
.
swap_space_bytes
=
swap_space
*
_GB
self
.
swap_space_bytes
=
swap_space
*
GiB_bytes
self
.
num_gpu_blocks_override
=
num_gpu_blocks_override
self
.
cache_dtype
=
cache_dtype
self
.
sliding_window
=
sliding_window
...
...
@@ -561,9 +560,9 @@ class CacheConfig:
num_gpus_per_node
=
parallel_config
.
tensor_parallel_size
cpu_memory_usage
=
self
.
swap_space_bytes
*
num_gpus_per_node
msg
=
(
f
"
{
cpu_memory_usage
/
_GB
:.
2
f
}
GiB out of "
f
"
the
{
total_cpu_memory
/
_GB
:.
2
f
}
GiB total CPU memory
is
"
"allocated for the swap space."
)
msg
=
(
f
"
{
cpu_memory_usage
/
GiB_bytes
:.
2
f
}
GiB out of
the
"
f
"
{
total_cpu_memory
/
GiB_bytes
:.
2
f
}
GiB total CPU memory "
"
is
allocated for the swap space."
)
if
cpu_memory_usage
>
0.7
*
total_cpu_memory
:
raise
ValueError
(
"Too large swap space. "
+
msg
)
elif
cpu_memory_usage
>
0.4
*
total_cpu_memory
:
...
...
vllm/engine/arg_utils.py
View file @
4ddc4743
...
...
@@ -58,8 +58,8 @@ class EngineArgs:
enable_prefix_caching
:
bool
=
False
disable_sliding_window
:
bool
=
False
use_v2_block_manager
:
bool
=
False
swap_space
:
in
t
=
4
# GiB
cpu_offload_gb
:
in
t
=
0
# GiB
swap_space
:
floa
t
=
4
# GiB
cpu_offload_gb
:
floa
t
=
0
# GiB
gpu_memory_utilization
:
float
=
0.90
max_num_batched_tokens
:
Optional
[
int
]
=
None
max_num_seqs
:
int
=
256
...
...
@@ -321,7 +321,7 @@ class EngineArgs:
default
=
EngineArgs
.
seed
,
help
=
'Random seed for operations.'
)
parser
.
add_argument
(
'--swap-space'
,
type
=
in
t
,
type
=
floa
t
,
default
=
EngineArgs
.
swap_space
,
help
=
'CPU swap space size (GiB) per GPU.'
)
parser
.
add_argument
(
...
...
vllm/executor/cpu_executor.py
View file @
4ddc4743
...
...
@@ -13,7 +13,7 @@ from vllm.logger import init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
(
get_distributed_init_method
,
get_open_port
,
from
vllm.utils
import
(
GiB_bytes
,
get_distributed_init_method
,
get_open_port
,
get_vllm_instance_id
,
make_async
)
from
vllm.worker.worker_base
import
WorkerWrapperBase
...
...
@@ -332,7 +332,6 @@ def _verify_and_get_scheduler_config(
def
_verify_and_get_cache_config
(
config
:
CacheConfig
)
->
CacheConfig
:
_GB
=
1
<<
30
if
config
.
enable_prefix_caching
:
logger
.
warning
(
"Prefix caching is not supported on CPU, disable it."
)
config
.
enable_prefix_caching
=
False
...
...
@@ -341,11 +340,11 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
if
kv_cache_space
>=
0
:
if
kv_cache_space
==
0
:
config
.
cpu_kvcache_space_bytes
=
4
*
_GB
# type: ignore
config
.
cpu_kvcache_space_bytes
=
4
*
GiB_bytes
# type: ignore
logger
.
warning
(
"Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
"for CPU backend is not set, using 4 by default."
)
else
:
config
.
cpu_kvcache_space_bytes
=
kv_cache_space
*
_GB
# type: ignore
config
.
cpu_kvcache_space_bytes
=
kv_cache_space
*
GiB_bytes
# type: ignore
else
:
raise
RuntimeError
(
"Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
...
...
vllm/executor/openvino_executor.py
View file @
4ddc4743
...
...
@@ -10,8 +10,8 @@ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
make_async
)
from
vllm.utils
import
(
GiB_bytes
,
get_distributed_init_method
,
get_ip
,
get_open_port
,
make_async
)
logger
=
init_logger
(
__name__
)
...
...
@@ -165,14 +165,13 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
kv_cache_space
=
envs
.
VLLM_OPENVINO_KVCACHE_SPACE
if
kv_cache_space
>=
0
:
_GB
=
1
<<
30
if
kv_cache_space
==
0
:
config
.
openvino_kvcache_space_bytes
=
4
*
_GB
# type: ignore
config
.
openvino_kvcache_space_bytes
=
4
*
GiB_bytes
# type: ignore
logger
.
warning
(
"Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
"for OpenVINO backend is not set, using 4 by default."
)
else
:
config
.
openvino_kvcache_space_bytes
=
kv_cache_space
*
_GB
# type: ignore
config
.
openvino_kvcache_space_bytes
=
kv_cache_space
*
GiB_bytes
# type: ignore
else
:
raise
RuntimeError
(
"Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
...
...
vllm/utils.py
View file @
4ddc4743
...
...
@@ -115,6 +115,9 @@ STR_XFORMERS_ATTN_VAL: str = "XFORMERS"
STR_FLASH_ATTN_VAL
:
str
=
"FLASH_ATTN"
STR_INVALID_VAL
:
str
=
"INVALID"
GiB_bytes
=
1
<<
30
"""The number of bytes in one gibibyte (GiB)."""
STR_DTYPE_TO_TORCH_DTYPE
=
{
"half"
:
torch
.
half
,
"bfloat16"
:
torch
.
bfloat16
,
...
...
vllm/worker/tpu_worker.py
View file @
4ddc4743
...
...
@@ -143,8 +143,8 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
num_tpu_blocks
=
(
num_tpu_blocks
//
8
)
*
8
# Round down to 8.
# Calculate the CPU KV cache size based on the config.
num_cpu_blocks
=
(
self
.
cache_config
.
swap_space_bytes
//
block_size_bytes
)
num_cpu_blocks
=
int
(
self
.
cache_config
.
swap_space_bytes
//
block_size_bytes
)
num_cpu_blocks
=
(
num_cpu_blocks
//
8
)
*
8
# Round down to 8.
return
num_tpu_blocks
,
num_cpu_blocks
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment