Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d126ce21
Commit
d126ce21
authored
Nov 21, 2025
by
zhuwenwen
Browse files
add VLLM_USE_CUDA_GRAPH_SIZES to use 1-18... (not only 1 2 4 8 16)
parent
77fccdf4
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
21 additions
and
3 deletions
+21
-3
vllm/config.py
vllm/config.py
+8
-3
vllm/envs.py
vllm/envs.py
+13
-0
No files found.
vllm/config.py
View file @
d126ce21
...
...
@@ -4761,9 +4761,14 @@ class VllmConfig:
else
:
cuda_graph_sizes
=
self
.
scheduler_config
.
cuda_graph_sizes
if
len
(
cuda_graph_sizes
)
==
1
:
if
not
envs
.
VLLM_USE_CUDA_GRAPH_SIZES
:
batch_size_capture_list
=
[
1
,
2
,
4
]
+
[
i
for
i
in
range
(
8
,
cuda_graph_sizes
[
0
]
+
1
,
8
)
]
else
:
batch_size_capture_list
=
list
(
range
(
1
,
19
))
+
[
24
,
32
]
+
[
i
for
i
in
range
(
40
,
cuda_graph_sizes
[
0
]
+
1
,
8
)
]
elif
len
(
cuda_graph_sizes
)
>
1
:
batch_size_capture_list
=
sorted
(
cuda_graph_sizes
)
else
:
...
...
vllm/envs.py
View file @
d126ce21
...
...
@@ -182,6 +182,7 @@ if TYPE_CHECKING:
VLLM_USE_LIGHTOP_FILL_MOE_ALIGN
:
bool
=
False
USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT
:
bool
=
False
VLLM_USE_PP_BALANCE
:
bool
=
False
VLLM_USE_CUDA_GRAPH_SIZES
:
bool
=
False
def
get_default_cache_root
():
return
os
.
getenv
(
...
...
@@ -1151,32 +1152,44 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vllm pd separation will be used async
"VLLM_P2P_ASYNC"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_P2P_ASYNC"
,
"0"
))),
# pd separation p2p async buf tokens
"VLLM_P2P_BUF_TOKENS"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_P2P_BUF_TOKENS"
,
"30000"
)),
# vllm will enable minimal injection for pipeline parallel scheduling
"VLLM_SCHED_ENABLE_MINIMAL_INJECTION"
:
lambda
:
(
os
.
getenv
(
"VLLM_SCHED_ENABLE_MINIMAL_INJECTION"
,
"0"
).
lower
()
in
(
"true"
,
"1"
)),
# vLLM will split prefill and decode, not mix up
"VLLM_USE_PD_SPLIT"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PD_SPLIT"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# vLLM will sync to avoid pp vmfault
"VLLM_USE_PP_SYNC"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PP_SYNC"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# vLLM will use lightop to fuse fill and moe align
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# vllm will use custom-allreduce rmsquant fused op
"USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT"
:
lambda
:
(
os
.
getenv
(
'USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT'
,
'0'
).
lower
()
in
(
"true"
,
"1"
)),
"VLLM_USE_PP_BALANCE"
:
lambda
:
(
os
.
getenv
(
'VLLM_USE_PP_BALANCE'
,
'1'
).
lower
()
in
(
"true"
,
"1"
)),
# vLLM will use CUDA graph batch sizes 1-18, 24, 32, 40, 48, ... (instead of the default 1, 2, 4, 8, 16, ...)
"VLLM_USE_CUDA_GRAPH_SIZES"
:
lambda
:
(
os
.
getenv
(
'VLLM_USE_CUDA_GRAPH_SIZES'
,
'False'
).
lower
()
in
(
"true"
,
"1"
)),
}
# --8<-- [end:env-vars-definition]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment