Update max_seq_len_to_capture: default to max_model_len when unset, and cap it at max_model_len

72273242 · zhuwenwen · 267cc5ff · 72273242
Commit 72273242 authored Jul 21, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 7 additions and 4 deletions

vllm/config.py vllm/config.py +7 -4

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -313,7 +313,8 @@ class ModelConfig:
    graph and always execute the model in eager mode. If False, we will use
    CUDA graph and eager execution in hybrid for maximal performance and
    flexibility."""
-    max_seq_len_to_capture: int = 8192
+    # max_seq_len_to_capture: int = 8192
+    max_seq_len_to_capture: bool = None
    """Maximum sequence len covered by CUDA graphs. When a sequence has context
    length larger than this, we fall back to eager mode. Additionally for
    encoder-decoder models, if the sequence length of the encoder input is
@@ -973,9 +974,11 @@ class ModelConfig:
                    "non-quantized models.", self.quantization)

    def _verify_cuda_graph(self) -> None:
-        # self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
-        #                                   self.max_model_len)
+        if self.max_seq_len_to_capture is None:
            self.max_seq_len_to_capture = self.max_model_len
+        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
+                                          self.max_model_len)
+        # self.max_seq_len_to_capture = self.max_model_len
        # CUDAGraph capture not supported for enc-dec models and mllama on ROCm
        ROCM_UNSUPPORTED_MODELS = ['mllama']
        unsupported_rocm = (self.hf_config.model_type