Unverified Commit 81ede99c authored by Kuntai Du, committed by GitHub
Browse files

[Core] Deprecating block manager v1 and make block manager v2 default (#8704)

Remove block manager v1. This is the first step toward a prefix-caching-centric design: to get there, we need to simplify the code path so that only the v2 block manager is used (it performs much better with prefix caching).
parent 5eda21e7
...@@ -312,9 +312,7 @@ class Scheduler: ...@@ -312,9 +312,7 @@ class Scheduler:
# LoRAs. This should be improved in the future. # LoRAs. This should be improved in the future.
self.lora_config = lora_config self.lora_config = lora_config
version = "v1" version = "selfattn"
if self.scheduler_config.use_v2_block_manager:
version = "v2"
if (self.scheduler_config.embedding_mode if (self.scheduler_config.embedding_mode
or self.cache_config.is_attention_free): or self.cache_config.is_attention_free):
version = "placeholder" version = "placeholder"
......
...@@ -373,12 +373,13 @@ class EngineArgs: ...@@ -373,12 +373,13 @@ class EngineArgs:
action='store_true', action='store_true',
help='Disables sliding window, ' help='Disables sliding window, '
'capping to sliding window size') 'capping to sliding window size')
parser.add_argument( parser.add_argument('--use-v2-block-manager',
'--use-v2-block-manager',
default=EngineArgs.use_v2_block_manager,
action='store_true', action='store_true',
help='Use BlockSpaceMangerV2. By default this is set to True. ' help='[DEPRECATED] block manager v1 has been '
'Set to False to use BlockSpaceManagerV1') 'removed and SelfAttnBlockSpaceManager (i.e. '
'block manager v2) is now the default. '
'Setting this flag to True or False'
' has no effect on vLLM behavior.')
parser.add_argument( parser.add_argument(
'--num-lookahead-slots', '--num-lookahead-slots',
type=int, type=int,
...@@ -969,12 +970,6 @@ class EngineArgs: ...@@ -969,12 +970,6 @@ class EngineArgs:
"in low performance due to small KV cache space. Consider " "in low performance due to small KV cache space. Consider "
"setting --max-model-len to a smaller value.", max_model_len) "setting --max-model-len to a smaller value.", max_model_len)
if self.num_scheduler_steps > 1 and not self.use_v2_block_manager:
self.use_v2_block_manager = True
logger.warning(
"Enabled BlockSpaceManagerV2 because it is "
"required for multi-step (--num-scheduler-steps > 1)")
speculative_config = SpeculativeConfig.maybe_create_spec_config( speculative_config = SpeculativeConfig.maybe_create_spec_config(
target_model_config=model_config, target_model_config=model_config,
target_parallel_config=parallel_config, target_parallel_config=parallel_config,
...@@ -990,7 +985,6 @@ class EngineArgs: ...@@ -990,7 +985,6 @@ class EngineArgs:
speculative_disable_by_batch_size, speculative_disable_by_batch_size,
speculative_max_model_len=self.speculative_max_model_len, speculative_max_model_len=self.speculative_max_model_len,
enable_chunked_prefill=self.enable_chunked_prefill, enable_chunked_prefill=self.enable_chunked_prefill,
use_v2_block_manager=self.use_v2_block_manager,
disable_log_stats=self.disable_log_stats, disable_log_stats=self.disable_log_stats,
ngram_prompt_lookup_max=self.ngram_prompt_lookup_max, ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
ngram_prompt_lookup_min=self.ngram_prompt_lookup_min, ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
...@@ -1021,11 +1015,20 @@ class EngineArgs: ...@@ -1021,11 +1015,20 @@ class EngineArgs:
if speculative_config is None \ if speculative_config is None \
else speculative_config.num_lookahead_slots else speculative_config.num_lookahead_slots
if not self.use_v2_block_manager:
logger.warning(
"[DEPRECATED] Block manager v1 has been removed, "
"and setting --use-v2-block-manager to True or False has "
"no effect on vLLM behavior. Please remove "
"--use-v2-block-manager in your engine argument. "
"If your use case is not supported by "
"SelfAttnBlockSpaceManager (i.e. block manager v2),"
" please file an issue with detailed information.")
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
max_num_batched_tokens=self.max_num_batched_tokens, max_num_batched_tokens=self.max_num_batched_tokens,
max_num_seqs=self.max_num_seqs, max_num_seqs=self.max_num_seqs,
max_model_len=model_config.max_model_len, max_model_len=model_config.max_model_len,
use_v2_block_manager=self.use_v2_block_manager,
num_lookahead_slots=num_lookahead_slots, num_lookahead_slots=num_lookahead_slots,
delay_factor=self.scheduler_delay_factor, delay_factor=self.scheduler_delay_factor,
enable_chunked_prefill=self.enable_chunked_prefill, enable_chunked_prefill=self.enable_chunked_prefill,
...@@ -1081,13 +1084,6 @@ class EngineArgs: ...@@ -1081,13 +1084,6 @@ class EngineArgs:
or "all" in detailed_trace_modules, or "all" in detailed_trace_modules,
) )
if (model_config.get_sliding_window() is not None
and scheduler_config.chunked_prefill_enabled
and not scheduler_config.use_v2_block_manager):
raise ValueError(
"Chunked prefill is not supported with sliding window. "
"Set --disable-sliding-window to disable sliding window.")
return EngineConfig( return EngineConfig(
model_config=model_config, model_config=model_config,
cache_config=cache_config, cache_config=cache_config,
......
...@@ -247,7 +247,7 @@ class LLMEngine: ...@@ -247,7 +247,7 @@ class LLMEngine:
"enforce_eager=%s, kv_cache_dtype=%s, " "enforce_eager=%s, kv_cache_dtype=%s, "
"quantization_param_path=%s, device_config=%s, " "quantization_param_path=%s, device_config=%s, "
"decoding_config=%r, observability_config=%r, " "decoding_config=%r, observability_config=%r, "
"seed=%d, served_model_name=%s, use_v2_block_manager=%s, " "seed=%d, served_model_name=%s, "
"num_scheduler_steps=%d, chunked_prefill_enabled=%s " "num_scheduler_steps=%d, chunked_prefill_enabled=%s "
"multi_step_stream_outputs=%s, enable_prefix_caching=%s, " "multi_step_stream_outputs=%s, enable_prefix_caching=%s, "
"use_async_output_proc=%s, use_cached_outputs=%s, " "use_async_output_proc=%s, use_cached_outputs=%s, "
...@@ -280,7 +280,6 @@ class LLMEngine: ...@@ -280,7 +280,6 @@ class LLMEngine:
observability_config, observability_config,
model_config.seed, model_config.seed,
model_config.served_model_name, model_config.served_model_name,
scheduler_config.use_v2_block_manager,
scheduler_config.num_scheduler_steps, scheduler_config.num_scheduler_steps,
scheduler_config.chunked_prefill_enabled, scheduler_config.chunked_prefill_enabled,
scheduler_config.multi_step_stream_outputs, scheduler_config.multi_step_stream_outputs,
......
...@@ -64,7 +64,6 @@ if TYPE_CHECKING: ...@@ -64,7 +64,6 @@ if TYPE_CHECKING:
VLLM_USE_TRITON_AWQ: bool = False VLLM_USE_TRITON_AWQ: bool = False
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
VLLM_SKIP_P2P_CHECK: bool = False VLLM_SKIP_P2P_CHECK: bool = False
VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False
VLLM_TORCH_COMPILE_LEVEL: int = 0 VLLM_TORCH_COMPILE_LEVEL: int = 0
VLLM_DISABLED_KERNELS: List[str] = [] VLLM_DISABLED_KERNELS: List[str] = []
...@@ -427,11 +426,6 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -427,11 +426,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_SKIP_P2P_CHECK": "VLLM_SKIP_P2P_CHECK":
lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1", lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1",
# If set, allowing the use of deprecated block manager V1
"VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":
lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0"
) == "1",
# List of quantization kernels that should be disabled, used for testing # List of quantization kernels that should be disabled, used for testing
# and performance comparisons. Currently only affects MPLinearKernel # and performance comparisons. Currently only affects MPLinearKernel
# selection # selection
......
...@@ -574,17 +574,12 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -574,17 +574,12 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# paged attn. We can remove it if we make paged attn kernel # paged attn. We can remove it if we make paged attn kernel
# to properly handle slinding window attn. # to properly handle slinding window attn.
curr_sliding_window_block = self.sliding_window_blocks curr_sliding_window_block = self.sliding_window_blocks
if self.scheduler_config.use_v2_block_manager:
# number of elements in last block # number of elements in last block
suff_len = inter_data.seq_lens[seq_idx] % self.block_size suff_len = inter_data.seq_lens[seq_idx] % self.block_size
sliding_seq_len = min( sliding_seq_len = min(inter_data.seq_lens[seq_idx],
inter_data.seq_lens[seq_idx],
self.block_aligned_sliding_window + suff_len) self.block_aligned_sliding_window + suff_len)
if suff_len > 0: if suff_len > 0:
curr_sliding_window_block += 1 curr_sliding_window_block += 1
else:
sliding_seq_len = min(inter_data.seq_lens[seq_idx],
self.sliding_window)
inter_data.curr_sliding_window_blocks[ inter_data.curr_sliding_window_blocks[
seq_idx] = curr_sliding_window_block seq_idx] = curr_sliding_window_block
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment