Add VLLM_USE_CUDA_GRAPH_SIZES to capture CUDA graph batch sizes 1-18, 24, 32, 40, ... (instead of only 1, 2, 4, 8, 16, ...)

d126ce21 · zhuwenwen · 77fccdf4 · d126ce21 · d126ce21
Commit d126ce21 authored Nov 21, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 21 additions and 3 deletions

vllm/config.py vllm/config.py +8 -3

vllm/envs.py vllm/envs.py +13 -0

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4761,9 +4761,14 @@ class VllmConfig:
                else:
                    cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes 
                if len(cuda_graph_sizes) == 1:
-                    batch_size_capture_list = [1, 2, 4] + [
-                        i for i in range(8, cuda_graph_sizes[0] + 1, 8)
-                    ]
+                    if not envs.VLLM_USE_CUDA_GRAPH_SIZES:
+                        batch_size_capture_list = [1, 2, 4] + [
+                            i for i in range(8, cuda_graph_sizes[0] + 1, 8)
+                        ]
+                    else:
+                        batch_size_capture_list = list(range(1, 19)) + [24, 32] + [
+                            i for i in range(40, cuda_graph_sizes[0] + 1, 8)
+                        ]
                elif len(cuda_graph_sizes) > 1:
                    batch_size_capture_list = sorted(cuda_graph_sizes)
                else:

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -182,6 +182,7 @@ if TYPE_CHECKING:
    VLLM_USE_LIGHTOP_FILL_MOE_ALIGN: bool = False
    USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT: bool = False
    VLLM_USE_PP_BALANCE: bool = False
+    VLLM_USE_CUDA_GRAPH_SIZES: bool = False

 def get_default_cache_root():
    return os.getenv(
@@ -1151,32 +1152,44 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # vllm pd separation will be used async
    "VLLM_P2P_ASYNC":
    lambda: bool(int(os.getenv("VLLM_P2P_ASYNC", "0"))),
+    
    # pd separation p2p async buf tokens
    "VLLM_P2P_BUF_TOKENS":
    lambda: int(os.getenv("VLLM_P2P_BUF_TOKENS", "30000")),
+    
    # vllm will enable minimal injection for pipeline parallel scheduling
    "VLLM_SCHED_ENABLE_MINIMAL_INJECTION":
        lambda: (os.getenv("VLLM_SCHED_ENABLE_MINIMAL_INJECTION", "0").lower() in
                 ("true", "1")),
+        
    # vLLM will split prefill and decode, not mix up
    "VLLM_USE_PD_SPLIT":
        lambda: (os.environ.get("VLLM_USE_PD_SPLIT", "False").lower() in
                 ("true", "1")), 
+        
    # vLLM will sync to avoid pp vmfault
    "VLLM_USE_PP_SYNC":
        lambda: (os.environ.get("VLLM_USE_PP_SYNC", "False").lower() in
                 ("true", "1")),
+        
    # vLLM will use lightop to fuse fill and moe align
    "VLLM_USE_LIGHTOP_FILL_MOE_ALIGN":
        lambda: (os.environ.get("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN", "False").lower() in
                 ("true", "1")), 
+        
    # vllm will use custom-allreduce rmsquant fused op
    "USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT": 
    lambda: (os.getenv('USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT', '0').lower() in
             ("true", "1")),
+    
    "VLLM_USE_PP_BALANCE":
        lambda: (os.getenv('VLLM_USE_PP_BALANCE', '1').lower() in
                 ("true", "1")),
+    
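+    # If enabled, vLLM builds the CUDA graph batch-size capture list as 1-18, 24, 32, then steps of 8 from 40 (instead of 1, 2, 4, then steps of 8 from 8)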
+    "VLLM_USE_CUDA_GRAPH_SIZES":
+        lambda: (os.getenv('VLLM_USE_CUDA_GRAPH_SIZES', 'False').lower() in
+                 ("true", "1")),
 }

 # --8<-- [end:env-vars-definition]