[Core] Deprecating block manager v1 and make block manager v2 default (#8704)

This commit removes block manager v1. It is the first step toward a prefix-caching-centric design: to achieve that design, we need to simplify the code path so that only the v2 block manager is used (it has much higher performance on prefix caching).

[Core] Deprecating block manager v1 and make block manager v2 default (#8704)
This commit removes block manager v1. It is the first step toward a prefix-caching-centric design: to achieve that design, we need to simplify the code path so that only the v2 block manager is used (it has much higher performance on prefix caching).
81ede99c · Kuntai Du · GitHub · 5eda21e7 · 81ede99c · 81ede99c
Unverified Commit 81ede99c authored Oct 17, 2024 by Kuntai Du Committed by GitHub Oct 17, 2024
5 changed files
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -312,9 +312,7 @@ class Scheduler:
        # LoRAs. This should be improved in the future.
        self.lora_config = lora_config
-        version = "v1"
+        version = "selfattn"
-        if self.scheduler_config.use_v2_block_manager:
-            version = "v2"
        if (self.scheduler_config.embedding_mode
                or self.cache_config.is_attention_free):
            version = "placeholder"

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -373,12 +373,13 @@ class EngineArgs:
                            action='store_true',
                            help='Disables sliding window, '
                            'capping to sliding window size')
-        parser.add_argument(
+        parser.add_argument('--use-v2-block-manager',
-            '--use-v2-block-manager',
-            default=EngineArgs.use_v2_block_manager,
                            action='store_true',
-            help='Use BlockSpaceMangerV2. By default this is set to True. '
+                            help='[DEPRECATED] block manager v1 has been '
-            'Set to False to use BlockSpaceManagerV1')
+                            'removed and SelfAttnBlockSpaceManager (i.e. '
+                            'block manager v2) is now the default. '
+                            'Setting this flag to True or False'
+                            ' has no effect on vLLM behavior.')
        parser.add_argument(
            '--num-lookahead-slots',
            type=int,
@@ -969,12 +970,6 @@ class EngineArgs:
                "in low performance due to small KV cache space. Consider "
                "setting --max-model-len to a smaller value.", max_model_len)
-        if self.num_scheduler_steps > 1 and not self.use_v2_block_manager:
-            self.use_v2_block_manager = True
-            logger.warning(
-                "Enabled BlockSpaceManagerV2 because it is "
-                "required for multi-step (--num-scheduler-steps > 1)")
        speculative_config = SpeculativeConfig.maybe_create_spec_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
@@ -990,7 +985,6 @@ class EngineArgs:
            speculative_disable_by_batch_size,
            speculative_max_model_len=self.speculative_max_model_len,
            enable_chunked_prefill=self.enable_chunked_prefill,
-            use_v2_block_manager=self.use_v2_block_manager,
            disable_log_stats=self.disable_log_stats,
            ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
            ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
@@ -1021,11 +1015,20 @@ class EngineArgs:
            if speculative_config is None \
            else speculative_config.num_lookahead_slots
+        if not self.use_v2_block_manager:
+            logger.warning(
+                "[DEPRECATED] Block manager v1 has been removed, "
+                "and setting --use-v2-block-manager to True or False has "
+                "no effect on vLLM behavior. Please remove "
+                "--use-v2-block-manager in your engine argument. "
+                "If your use case is not supported by "
+                "SelfAttnBlockSpaceManager (i.e. block manager v2),"
+                " please file an issue with detailed information.")
        scheduler_config = SchedulerConfig(
            max_num_batched_tokens=self.max_num_batched_tokens,
            max_num_seqs=self.max_num_seqs,
            max_model_len=model_config.max_model_len,
-            use_v2_block_manager=self.use_v2_block_manager,
            num_lookahead_slots=num_lookahead_slots,
            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
@@ -1081,13 +1084,6 @@ class EngineArgs:
            or "all" in detailed_trace_modules,
        )
-        if (model_config.get_sliding_window() is not None
-                and scheduler_config.chunked_prefill_enabled
-                and not scheduler_config.use_v2_block_manager):
-            raise ValueError(
-                "Chunked prefill is not supported with sliding window. "
-                "Set --disable-sliding-window to disable sliding window.")
        return EngineConfig(
            model_config=model_config,
            cache_config=cache_config,

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -247,7 +247,7 @@ class LLMEngine:
            "enforce_eager=%s, kv_cache_dtype=%s, "
            "quantization_param_path=%s, device_config=%s, "
            "decoding_config=%r, observability_config=%r, "
-            "seed=%d, served_model_name=%s, use_v2_block_manager=%s, "
+            "seed=%d, served_model_name=%s, "
            "num_scheduler_steps=%d, chunked_prefill_enabled=%s "
            "multi_step_stream_outputs=%s, enable_prefix_caching=%s, "
            "use_async_output_proc=%s, use_cached_outputs=%s, "
@@ -280,7 +280,6 @@ class LLMEngine:
            observability_config,
            model_config.seed,
            model_config.served_model_name,
-            scheduler_config.use_v2_block_manager,
            scheduler_config.num_scheduler_steps,
            scheduler_config.chunked_prefill_enabled,
            scheduler_config.multi_step_stream_outputs,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -64,7 +64,6 @@ if TYPE_CHECKING:
    VLLM_USE_TRITON_AWQ: bool = False
    VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
    VLLM_SKIP_P2P_CHECK: bool = False
-    VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False
    VLLM_TORCH_COMPILE_LEVEL: int = 0
    VLLM_DISABLED_KERNELS: List[str] = []
@@ -427,11 +426,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_SKIP_P2P_CHECK":
    lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1",
-    # If set, allowing the use of deprecated block manager V1
-    "VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":
-    lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0"
-                           ) == "1",
    # List of quantization kernels that should be disabled, used for testing
    # and performance comparisons. Currently only affects MPLinearKernel
    # selection

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -574,17 +574,12 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
            # paged attn. We can remove it if we make paged attn kernel
            # to properly handle slinding window attn.
            curr_sliding_window_block = self.sliding_window_blocks
-            if self.scheduler_config.use_v2_block_manager:
            # number of elements in last block
            suff_len = inter_data.seq_lens[seq_idx] % self.block_size
-                sliding_seq_len = min(
+            sliding_seq_len = min(inter_data.seq_lens[seq_idx],
-                    inter_data.seq_lens[seq_idx],
                                  self.block_aligned_sliding_window + suff_len)
            if suff_len > 0:
                curr_sliding_window_block += 1
-            else:
-                sliding_seq_len = min(inter_data.seq_lens[seq_idx],
-                                      self.sliding_window)
        inter_data.curr_sliding_window_blocks[
            seq_idx] = curr_sliding_window_block