[Bugfix] fix automatic prefix args and add log info (#3608)

e67c295b · TianYu GUO · GitHub · 925f3332 · e67c295b · e67c295b
Unverified commit e67c295b, authored Mar 25, 2024 by TianYu GUO; committed by GitHub on Mar 25, 2024
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 7 additions and 1 deletion

vllm/core/block_manager.py vllm/core/block_manager.py +5 -0

vllm/engine/arg_utils.py vllm/engine/arg_utils.py +2 -1

No files found.
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -9,6 +9,9 @@ from vllm.block import BlockTable, PhysicalTokenBlock
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
 from vllm.utils import Device
 from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)


 class BlockAllocatorBase(ABC):
@@ -241,11 +244,13 @@ class BlockSpaceManager:
        self.watermark_blocks = int(watermark * num_gpu_blocks)

        if self.enable_caching:
+            logger.info("enable automatic prefix caching")
            self.gpu_allocator = CachedBlockAllocator(Device.GPU, block_size,
                                                      num_gpu_blocks)
            self.cpu_allocator = CachedBlockAllocator(Device.CPU, block_size,
                                                      num_cpu_blocks)
        else:
+            logger.info("disable automatic prefix caching")
            self.gpu_allocator = UncachedBlockAllocator(
                Device.GPU, block_size, num_gpu_blocks)
            self.cpu_allocator = UncachedBlockAllocator(

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -337,7 +337,8 @@ class EngineArgs:
        cache_config = CacheConfig(self.block_size,
                                   self.gpu_memory_utilization,
                                   self.swap_space, self.kv_cache_dtype,
-                                   model_config.get_sliding_window())
+                                   model_config.get_sliding_window(),
+                                   self.enable_prefix_caching)
        parallel_config = ParallelConfig(
            self.pipeline_parallel_size, self.tensor_parallel_size,
            self.worker_use_ray, self.max_parallel_loading_workers,