Merge branch 'v0.15.1-dev_t2' into 'v0.15.1-dev'

增加max_cudagraph_capture_size，细化capture的范围 See merge request dcutoolkit/deeplearing/vllm!494

Merge branch 'v0.15.1-dev_t2' into 'v0.15.1-dev'
增加max_cudagraph_capture_size，细化capture的范围 See merge request dcutoolkit/deeplearing/vllm!494
5a14b60c · wangmin6 · 8781f412 · 1b71f522 · 5a14b60c · 5a14b60c
Commit 5a14b60c authored Mar 12, 2026 by wangmin6
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 3 deletions

vllm/config/vllm.py vllm/config/vllm.py +8 -3

vllm/envs.py vllm/envs.py +5 -0

No files found.
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1157,9 +1157,14 @@ class VllmConfig:
                # sort to make sure the sizes are in ascending order
                cudagraph_capture_sizes.sort()
            else:
-                cudagraph_capture_sizes = [
-                    i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
-                ]
+                if not envs.VLLM_USE_CUDA_GRAPH_SIZES:
+                    cudagraph_capture_sizes = [
+                        i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
+                    ]
+                else:
+                    cudagraph_capture_sizes = list(range(1, 25)) + [32] + [
+                            i for i in range(40, max_cudagraph_capture_size + 1, 8)
+                        ]
                if max_cudagraph_capture_size >= 8:
                    # Step size 8 for small batch sizes, up to 256(not included)
                    cudagraph_capture_sizes += list(

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -306,6 +306,7 @@ if TYPE_CHECKING:
    VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER: bool = False
    VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE: bool = False
    VLLM_USE_FUSED_DTBMM: bool = False # DOUBLE TRANS BMM FP8
+    VLLM_USE_CUDA_GRAPH_SIZES: bool = False


 def get_default_cache_root():
@@ -1918,6 +1919,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_FUSED_DTBMM":
        lambda: (os.environ.get("VLLM_USE_FUSED_DTBMM", "False").lower() in
                ("true", "1")),
+    # If set, capture CUDA graph sizes 1-24, 32, 40, 48, ... (not just 1, 2, 4, 8, 16, ...)
+    "VLLM_USE_CUDA_GRAPH_SIZES":
+        lambda: (os.getenv("VLLM_USE_CUDA_GRAPH_SIZES", "False").lower() in
+                ("true", "1")),
 }

 # --8<-- [end:env-vars-definition]