Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
da65bec3
Unverified
Commit
da65bec3
authored
Aug 22, 2025
by
Shiyan Deng
Committed by
GitHub
Aug 22, 2025
Browse files
add an env var for path to pre-downloaded flashinfer cubin files (#22675)
parent
4645024d
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
11 additions
and
0 deletions
+11
-0
vllm/envs.py
vllm/envs.py
+6
-0
vllm/utils/flashinfer.py
vllm/utils/flashinfer.py
+5
-0
No files found.
vllm/envs.py
View file @
da65bec3
...
@@ -158,6 +158,7 @@ if TYPE_CHECKING:
...
@@ -158,6 +158,7 @@ if TYPE_CHECKING:
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE
:
bool
=
False
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE
:
bool
=
False
VLLM_ENABLE_RESPONSES_API_STORE
:
bool
=
False
VLLM_ENABLE_RESPONSES_API_STORE
:
bool
=
False
VLLM_USE_TRTLLM_ATTENTION
:
Optional
[
str
]
=
None
VLLM_USE_TRTLLM_ATTENTION
:
Optional
[
str
]
=
None
VLLM_HAS_FLASHINFER_CUBIN
:
bool
=
False
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
:
bool
=
False
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
:
bool
=
False
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
:
bool
=
False
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
:
bool
=
False
VLLM_TUNED_CONFIG_FOLDER
:
Optional
[
str
]
=
None
VLLM_TUNED_CONFIG_FOLDER
:
Optional
[
str
]
=
None
...
@@ -1105,6 +1106,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1105,6 +1106,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_TRTLLM_ATTENTION"
:
"VLLM_USE_TRTLLM_ATTENTION"
:
lambda
:
os
.
getenv
(
"VLLM_USE_TRTLLM_ATTENTION"
,
None
),
lambda
:
os
.
getenv
(
"VLLM_USE_TRTLLM_ATTENTION"
,
None
),
# If set to any non-empty value, it means we pre-downloaded cubin files and
# flashinfer will read the cubin files directly.
"VLLM_HAS_FLASHINFER_CUBIN"
:
lambda
:
os
.
getenv
(
"VLLM_HAS_FLASHINFER_CUBIN"
,
False
),
# If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer.
# If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer.
# Otherwise, uses the first available of: flashinfer cutlass GEMM,
# Otherwise, uses the first available of: flashinfer cutlass GEMM,
# vllm cutlass GEMM, marlin GEMM.
# vllm cutlass GEMM, marlin GEMM.
...
...
vllm/utils/flashinfer.py
View file @
da65bec3
...
@@ -132,6 +132,11 @@ def has_nvidia_artifactory() -> bool:
...
@@ -132,6 +132,11 @@ def has_nvidia_artifactory() -> bool:
This checks connectivity to the kernel inference library artifactory
This checks connectivity to the kernel inference library artifactory
which is required for downloading certain cubin kernels like TRTLLM FMHA.
which is required for downloading certain cubin kernels like TRTLLM FMHA.
"""
"""
# FLASHINFER_CUBIN_DIR defines the pre-downloaded cubins path, so when
# VLLM_HAS_FLASHINFER_CUBIN is set, we can assume the cubins are available.
if
envs
.
VLLM_HAS_FLASHINFER_CUBIN
:
return
True
try
:
try
:
# Use a short timeout to avoid blocking for too long
# Use a short timeout to avoid blocking for too long
response
=
requests
.
get
(
FLASHINFER_CUBINS_REPOSITORY
,
timeout
=
5
)
response
=
requests
.
get
(
FLASHINFER_CUBINS_REPOSITORY
,
timeout
=
5
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment