support --no-enable-prefix-caching

ea0ccfe6 · zhuwenwen · 7a5df8f7 · ea0ccfe6 · ea0ccfe6
Commit ea0ccfe6 authored Aug 26, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 11 additions and 0 deletions

vllm/config/__init__.py +4 -0

vllm/engine/arg_utils.py +7 -0

No files found.
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -470,6 +470,9 @@ class ModelConfig:
    logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None
    """One or more logits processors' fully-qualified class names or class
    definitions"""
+    enable_chunked_prefill: Optional[bool] = None
+    """If True, prefill requests can be chunked based
+    on the remaining max_num_batched_tokens."""

    def compute_hash(self) -> str:
        """
@@ -500,6 +503,7 @@ class ModelConfig:
        factors.append(self.rope_theta)
        # hf_config can control how the model looks!
        factors.append(self.hf_config.to_json_string())
+        factors.append(self.enable_chunked_prefill)
        str_factors = str(factors)
        assert_hashable(str_factors)
        return hashlib.sha256(str(factors).encode()).hexdigest()

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1590,6 +1590,9 @@ class EngineArgs:
        # For pooling tasks the default is False
        if model_config.runner_type != "pooling":
            self.enable_chunked_prefill = True
+            if model_config.enable_chunked_prefill is not None and \
+                model_config.enable_chunked_prefill is False:
+                self.enable_chunked_prefill = False
            if self.enable_prefix_caching is None:
                self.enable_prefix_caching = True
        else:
@@ -1603,6 +1606,10 @@ class EngineArgs:
            action = "Enabling" if \
                incremental_prefill_supported else "Disabling"
                
+            if model_config.enable_chunked_prefill is not None and \
+                model_config.enable_chunked_prefill is False:
+                self.enable_chunked_prefill = False
+
            if self.enable_chunked_prefill is None:
                self.enable_chunked_prefill = incremental_prefill_supported
                logger.info("(%s) chunked prefill by default", action)