[Platform] Make EngineArgs initialization platform-agnostic (#11225)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>

[Platform] Make EngineArgs initialization platform-agnostic (#11225)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
e88db68c · wangxiyuan · GitHub · 59c9b6eb · e88db68c · e88db68c
Unverified commit e88db68c: authored Dec 17, 2024 by wangxiyuan; committed by GitHub on Dec 16, 2024.
9 changed files
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -112,9 +112,7 @@ class EngineArgs:
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
-    # NOTE(kzawora): default block size for Gaudi should be 128
-    # smaller sizes still work, but very inefficiently
-    block_size: int = 16 if not current_platform.is_hpu() else 128
+    block_size: Optional[int] = None
    enable_prefix_caching: Optional[bool] = None
    disable_sliding_window: bool = False
    use_v2_block_manager: bool = True
@@ -1036,9 +1034,7 @@ class EngineArgs:
            self.enable_prefix_caching = False

        cache_config = CacheConfig(
-            # neuron needs block_size = max_model_len
-            block_size=self.block_size if self.device != "neuron" else
-            (self.max_model_len if self.max_model_len is not None else 0),
+            block_size=self.block_size,
            gpu_memory_utilization=self.gpu_memory_utilization,
            swap_space=self.swap_space,
            cache_dtype=self.kv_cache_dtype,

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -60,6 +60,9 @@ class CpuPlatform(Platform):

        cache_config = vllm_config.cache_config

+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
        kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE

        if kv_cache_space >= 0:

--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -137,6 +137,10 @@ class CudaPlatformBase(Platform):
                else:
                    parallel_config.worker_cls = "vllm.worker.worker.Worker"

+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+

 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,

--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -48,6 +48,12 @@ class HpuPlatform(Platform):
        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"

+        # NOTE(kzawora): default block size for Gaudi should be 128
+        # smaller sizes still work, but very inefficiently
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 128
+
    @classmethod
    def is_pin_memory_available(cls):
        logger.warning("Pin memory is not supported on HPU.")

--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -33,6 +33,12 @@ class NeuronPlatform(Platform):
            parallel_config.worker_cls = \
                "vllm.worker.neuron_worker.NeuronWorker"

+        cache_config = vllm_config.cache_config
+        if cache_config:
+            # neuron needs block_size = max_model_len
+            vllm_config.cache_config.block_size = \
+                vllm_config.model_config.max_model_len
+
    @classmethod
    def is_pin_memory_available(cls) -> bool:
        logger.warning("Pin memory is not supported on Neuron.")

--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -87,6 +87,9 @@ class OpenVinoPlatform(Platform):
        # check and update cache config
        ov_core = ov.Core()
        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
        if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
            if not OpenVinoPlatform.is_openvino_cpu():
                logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is"

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -84,6 +84,10 @@ class RocmPlatform(Platform):

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
        parallel_config = vllm_config.parallel_config
        scheduler_config = vllm_config.scheduler_config
        if parallel_config.worker_cls == "auto":

--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -46,6 +46,11 @@ class TpuPlatform(Platform):
    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        from vllm.config import CompilationLevel
+
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
        compilation_config = vllm_config.compilation_config
        if compilation_config.level == CompilationLevel.NO_COMPILATION:
            # TPU does not support NO_COMPILATION

--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -51,6 +51,10 @@ class XPUPlatform(Platform):

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
        # check and update model config
        model_config = vllm_config.model_config
        if model_config.dtype == torch.bfloat16: