Commit 2c4b2c80 authored by zhuwenwen
Browse files

Support --no-enable-chunked-prefill for V1

parent f7e9c329
...@@ -418,6 +418,9 @@ class ModelConfig: ...@@ -418,6 +418,9 @@ class ModelConfig:
- "transformers" will use the Transformers model implementation.""" - "transformers" will use the Transformers model implementation."""
override_attention_dtype: Optional[str] = None override_attention_dtype: Optional[str] = None
"""Override dtype for attention""" """Override dtype for attention"""
enable_chunked_prefill: Optional[bool] = None
"""If True, prefill requests can be chunked based
on the remaining max_num_batched_tokens."""
def compute_hash(self) -> str: def compute_hash(self) -> str:
""" """
...@@ -448,6 +451,7 @@ class ModelConfig: ...@@ -448,6 +451,7 @@ class ModelConfig:
factors.append(self.rope_theta) factors.append(self.rope_theta)
# hf_config can control how the model looks! # hf_config can control how the model looks!
factors.append(self.hf_config.to_json_string()) factors.append(self.hf_config.to_json_string())
factors.append(self.enable_chunked_prefill)
str_factors = str(factors) str_factors = str(factors)
assert_hashable(str_factors) assert_hashable(str_factors)
return hashlib.sha256(str(factors).encode()).hexdigest() return hashlib.sha256(str(factors).encode()).hexdigest()
......
...@@ -1004,6 +1004,7 @@ class EngineArgs: ...@@ -1004,6 +1004,7 @@ class EngineArgs:
enable_sleep_mode=self.enable_sleep_mode, enable_sleep_mode=self.enable_sleep_mode,
model_impl=self.model_impl, model_impl=self.model_impl,
override_attention_dtype=self.override_attention_dtype, override_attention_dtype=self.override_attention_dtype,
enable_chunked_prefill=self.enable_chunked_prefill,
) )
def create_load_config(self) -> LoadConfig: def create_load_config(self) -> LoadConfig:
...@@ -1593,6 +1594,9 @@ class EngineArgs: ...@@ -1593,6 +1594,9 @@ class EngineArgs:
# For pooling tasks the default is False # For pooling tasks the default is False
if model_config.runner_type != "pooling": if model_config.runner_type != "pooling":
self.enable_chunked_prefill = True self.enable_chunked_prefill = True
if model_config.enable_chunked_prefill is not None and \
model_config.enable_chunked_prefill is False:
self.enable_chunked_prefill = False
if self.enable_prefix_caching is None: if self.enable_prefix_caching is None:
self.enable_prefix_caching = True self.enable_prefix_caching = True
else: else:
...@@ -1606,6 +1610,10 @@ class EngineArgs: ...@@ -1606,6 +1610,10 @@ class EngineArgs:
action = "Enabling" if \ action = "Enabling" if \
incremental_prefill_supported else "Disabling" incremental_prefill_supported else "Disabling"
if model_config.enable_chunked_prefill is not None and \
model_config.enable_chunked_prefill is False:
self.enable_chunked_prefill = False
if self.enable_chunked_prefill is None: if self.enable_chunked_prefill is None:
self.enable_chunked_prefill = incremental_prefill_supported self.enable_chunked_prefill = incremental_prefill_supported
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment