Support --no-enable-chunked-prefill in V1

573531eb · zhuwenwen · 33f37e9f · 573531eb · 573531eb
Commit 573531eb authored Oct 15, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 13 additions and 0 deletions

vllm/config/model.py +4 -0

vllm/engine/arg_utils.py +9 -0

No files found.
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -276,6 +276,9 @@ class ModelConfig:
    override_pooler_config: Optional[Union[dict, PoolerConfig]] = None
    """[DEPRECATED] Use `pooler_config` instead. This field will be removed in
    v0.12.0 or v1.0.0, whichever is sooner."""
+    enable_chunked_prefill: Optional[bool] = None
+    """If True, prefill requests can be chunked based
+    on the remaining max_num_batched_tokens."""
    # Multimodal config and init vars
    multimodal_config: Optional[MultiModalConfig] = None
@@ -320,6 +323,7 @@ class ModelConfig:
        factors.append(self.rope_scaling)
        factors.append(self.rope_theta)
        factors.append(self.video_pruning_rate)
+        factors.append(self.enable_chunked_prefill)
        # hf_config can control how the model looks!
        try:

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1055,6 +1055,7 @@ class EngineArgs:
            logits_processors=self.logits_processors,
            video_pruning_rate=self.video_pruning_rate,
            io_processor_plugin=self.io_processor_plugin,
+            enable_chunked_prefill=self.enable_chunked_prefill,
        )
    def validate_tensorizer_args(self):
@@ -1561,6 +1562,10 @@ class EngineArgs:
        if model_config.runner_type != "pooling":
            self.enable_chunked_prefill = True
+            if model_config.enable_chunked_prefill is not None and \
+                model_config.enable_chunked_prefill is False:
+                self.enable_chunked_prefill = False
            # TODO: When prefix caching supports prompt embeds inputs, this
            # check can be removed.
            if (self.enable_prompt_embeds
@@ -1584,6 +1589,10 @@ class EngineArgs:
            action = "Enabling" if \
                incremental_prefill_supported else "Disabling"
+            if model_config.enable_chunked_prefill is not None and \
+                model_config.enable_chunked_prefill is False:
+                self.enable_chunked_prefill = False
            if self.enable_chunked_prefill is None:
                self.enable_chunked_prefill = incremental_prefill_supported
                logger.info("(%s) chunked prefill by default", action)