[Core] Consolidate `GB` constant and enable float GB arguments (#7416)

4ddc4743 · Cyrus Leung · GitHub · 6aa33cb2 · 4ddc4743 · 4ddc4743
Unverified Commit 4ddc4743 authored Aug 13, 2024 by Cyrus Leung Committed by GitHub Aug 12, 2024
6 changed files
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -12,7 +12,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.models import ModelRegistry
 from vllm.tracing import is_otel_installed
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH,
+from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
                        cuda_device_count_stateless, get_cpu_memory, is_cpu,
                        is_hip, is_neuron, is_openvino, is_tpu, is_xpu,
                        print_warning_once)
@@ -27,7 +27,6 @@ if TYPE_CHECKING:

 logger = init_logger(__name__)

-_GB = 1 << 30
 _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768

 _PP_SUPPORTED_MODELS = [
@@ -492,7 +491,7 @@ class CacheConfig:
        self,
        block_size: int,
        gpu_memory_utilization: float,
-        swap_space: int,
+        swap_space: float,
        cache_dtype: str,
        num_gpu_blocks_override: Optional[int] = None,
        sliding_window: Optional[int] = None,
@@ -501,7 +500,7 @@ class CacheConfig:
    ) -> None:
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
-        self.swap_space_bytes = swap_space * _GB
+        self.swap_space_bytes = swap_space * GiB_bytes
        self.num_gpu_blocks_override = num_gpu_blocks_override
        self.cache_dtype = cache_dtype
        self.sliding_window = sliding_window
@@ -561,9 +560,9 @@ class CacheConfig:
        num_gpus_per_node = parallel_config.tensor_parallel_size
        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node

-        msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
-               f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
-               "allocated for the swap space.")
+        msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
+               f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
+               "is allocated for the swap space.")
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        elif cpu_memory_usage > 0.4 * total_cpu_memory:

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -58,8 +58,8 @@ class EngineArgs:
    enable_prefix_caching: bool = False
    disable_sliding_window: bool = False
    use_v2_block_manager: bool = False
-    swap_space: int = 4  # GiB
-    cpu_offload_gb: int = 0  # GiB
+    swap_space: float = 4  # GiB
+    cpu_offload_gb: float = 0  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
@@ -321,7 +321,7 @@ class EngineArgs:
                            default=EngineArgs.seed,
                            help='Random seed for operations.')
        parser.add_argument('--swap-space',
-                            type=int,
+                            type=float,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU.')
        parser.add_argument(

--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -13,7 +13,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (get_distributed_init_method, get_open_port,
+from vllm.utils import (GiB_bytes, get_distributed_init_method, get_open_port,
                        get_vllm_instance_id, make_async)
 from vllm.worker.worker_base import WorkerWrapperBase

@@ -332,7 +332,6 @@ def _verify_and_get_scheduler_config(


 def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
-    _GB = 1 << 30
    if config.enable_prefix_caching:
        logger.warning("Prefix caching is not supported on CPU, disable it.")
        config.enable_prefix_caching = False
@@ -341,11 +340,11 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:

    if kv_cache_space >= 0:
        if kv_cache_space == 0:
-            config.cpu_kvcache_space_bytes = 4 * _GB  # type: ignore
+            config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
            logger.warning("Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
                           "for CPU backend is not set, using 4 by default.")
        else:
-            config.cpu_kvcache_space_bytes = kv_cache_space * _GB  # type: ignore
+            config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore
    else:
        raise RuntimeError(
            "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"

--- a/vllm/executor/openvino_executor.py
+++ b/vllm/executor/openvino_executor.py
@@ -10,8 +10,8 @@ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async)
+from vllm.utils import (GiB_bytes, get_distributed_init_method, get_ip,
+                        get_open_port, make_async)

 logger = init_logger(__name__)

@@ -165,14 +165,13 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:

    kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
    if kv_cache_space >= 0:
-        _GB = 1 << 30
        if kv_cache_space == 0:
-            config.openvino_kvcache_space_bytes = 4 * _GB  # type: ignore
+            config.openvino_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
            logger.warning(
                "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
                "for OpenVINO backend is not set, using 4 by default.")
        else:
-            config.openvino_kvcache_space_bytes = kv_cache_space * _GB  # type: ignore
+            config.openvino_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore
    else:
        raise RuntimeError(
            "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -115,6 +115,9 @@ STR_XFORMERS_ATTN_VAL: str = "XFORMERS"
 STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
 STR_INVALID_VAL: str = "INVALID"

+GiB_bytes = 1 << 30
+"""The number of bytes in one gibibyte (GiB)."""
+
 STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.half,
    "bfloat16": torch.bfloat16,

--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -143,8 +143,8 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
        num_tpu_blocks = (num_tpu_blocks // 8) * 8  # Round down to 8.

        # Calculate the CPU KV cache size based on the config.
-        num_cpu_blocks = (self.cache_config.swap_space_bytes //
-                          block_size_bytes)
+        num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+                             block_size_bytes)
        num_cpu_blocks = (num_cpu_blocks // 8) * 8  # Round down to 8.
        return num_tpu_blocks, num_cpu_blocks