Enable VLLM_USE_CUDA_GRAPH_SIZES by default (=1) and extend its CUDA graph capture sizes from 1-18 to 1-24... (not only 1 2 4 8 16)

Set the defaults of VLLM_USE_LIGHTOP_FILL_MOE_ALIGN, VLLM_USE_OPT_ZEROS and VLLM_USE_PP_SYNC to 1 (enabled) in vllm/envs.py

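Enable VLLM_USE_CUDA_GRAPH_SIZES by default (=1) and extend its CUDA graph capture sizes from 1-18 to 1-24... (not only 1 2 4 8 16)
Set the defaults of VLLM_USE_LIGHTOP_FILL_MOE_ALIGN, VLLM_USE_OPT_ZEROS and VLLM_USE_PP_SYNC to 1 (enabled) in vllm/envs.py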
2b47bce9 · zhuwenwen · ce755d66 · 2b47bce9 · 2b47bce9
Commit 2b47bce9 authored Nov 21, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 6 deletions

vllm/config.py vllm/config.py +1 -1

vllm/envs.py vllm/envs.py +5 -5

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4766,7 +4766,7 @@ class VllmConfig:
                            i for i in range(8, cuda_graph_sizes[0] + 1, 8)
                        ]
                    else:
-                        batch_size_capture_list = list(range(1, 19)) + [24, 32] + [
+                        batch_size_capture_list = list(range(1, 25)) + [32] + [
                            i for i in range(40, cuda_graph_sizes[0] + 1, 8)
                        ]
                elif len(cuda_graph_sizes) > 1:

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1114,7 +1114,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
                 ("true", "1")),
     # vLLM will use elementwise not triton_
    "VLLM_USE_OPT_ZEROS":
-        lambda: (os.environ.get("VLLM_USE_OPT_ZEROS", "False").lower() in
+        lambda: (os.environ.get("VLLM_USE_OPT_ZEROS", "True").lower() in
                 ("true", "1")),
    # vLLM will use opt cat for deepseek-v3
    "VLLM_USE_OPT_CAT":
@@ -1170,12 +1170,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # vLLM will sync to avoid pp vmfault
    "VLLM_USE_PP_SYNC":
-        lambda: (os.environ.get("VLLM_USE_PP_SYNC", "False").lower() in
+        lambda: (os.environ.get("VLLM_USE_PP_SYNC", "True").lower() in
                 ("true", "1")),
    # vLLM will use lightop to fuse fill and moe align
    "VLLM_USE_LIGHTOP_FILL_MOE_ALIGN":
-        lambda: (os.environ.get("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN", "False").lower() in
+        lambda: (os.environ.get("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN", "True").lower() in
                 ("true", "1")), 
    # vllm will use custom-allreduce rmsquant fused op
@@ -1191,9 +1191,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
        lambda: (os.getenv('VLLM_USE_ZERO_MTP', '1').lower() in
                 ("true", "1")),
-    # vllm will use 1-18... (not only 1 2 4 8 16)
+    # vllm will use 1-24... (not only 1 2 4 8 16 24)
    "VLLM_USE_CUDA_GRAPH_SIZES":
-        lambda: (os.getenv('VLLM_USE_CUDA_GRAPH_SIZES', 'False').lower() in
+        lambda: (os.getenv('VLLM_USE_CUDA_GRAPH_SIZES', 'True').lower() in
                 ("true", "1")),
 }