Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e3bf79ff
Unverified
Commit
e3bf79ff
authored
Feb 04, 2026
by
Luka Govedič
Committed by
GitHub
Feb 04, 2026
Browse files
Revert "[Attention][FA3] Update FA3 to include new swizzle optimization" (#33841)
parent
fb1270f1
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
3 additions
and
13 deletions
+3
-13
cmake/external_projects/vllm_flash_attn.cmake
cmake/external_projects/vllm_flash_attn.cmake
+1
-1
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+1
-6
vllm/v1/attention/backends/mla/flashattn_mla.py
vllm/v1/attention/backends/mla/flashattn_mla.py
+1
-6
No files found.
cmake/external_projects/vllm_flash_attn.cmake
View file @
e3bf79ff
...
...
@@ -38,7 +38,7 @@ else()
FetchContent_Declare
(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG
2adfc8c2177c5b0e8ddeedfd5a8990d80eb496ff
GIT_TAG
188be16520ceefdc625fdf71365585d2ee348fe2
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR
${
CMAKE_BINARY_DIR
}
/vllm-flash-attn
...
...
vllm/v1/attention/backends/flash_attn.py
View file @
e3bf79ff
...
...
@@ -308,15 +308,10 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
self
.
compilation_config
.
cudagraph_mode
.
has_full_cudagraphs
()
)
self
.
max_cudagraph_size
=
self
.
compilation_config
.
max_cudagraph_capture_size
max_num_seqs
=
vllm_config
.
scheduler_config
.
max_num_seqs
if
self
.
use_full_cuda_graph
and
self
.
aot_schedule
:
# Times 4 due to:
# https://github.com/vllm-project/flash-attention/blob/3223650ccabe622a0fcae65eec706a50186a89f7/hopper/flash_api.cpp#L650-L653
# For some tests max_cudagraph_size > max_num_seqs,
# so we need to use the larger one.
self
.
scheduler_metadata
=
torch
.
zeros
(
max
(
self
.
max_cudagraph_size
or
0
,
max_num_seqs
)
*
4
+
1
,
vllm_config
.
scheduler_config
.
max_num_seqs
+
1
,
dtype
=
torch
.
int32
,
device
=
self
.
device
,
)
...
...
vllm/v1/attention/backends/mla/flashattn_mla.py
View file @
e3bf79ff
...
...
@@ -127,15 +127,10 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
self
.
compilation_config
.
cudagraph_mode
.
has_full_cudagraphs
()
)
self
.
max_cudagraph_size
=
self
.
compilation_config
.
max_cudagraph_capture_size
max_num_seqs
=
vllm_config
.
scheduler_config
.
max_num_seqs
if
self
.
use_full_cuda_graph
and
self
.
fa_aot_schedule
:
# Times 4 due to:
# https://github.com/vllm-project/flash-attention/blob/3223650ccabe622a0fcae65eec706a50186a89f7/hopper/flash_api.cpp#L650-L653
# For some tests max_cudagraph_size > max_num_seqs,
# so we need to use the larger one.
self
.
scheduler_metadata
=
torch
.
zeros
(
max
(
self
.
max_cudagraph_size
or
0
,
max_num_seqs
)
*
4
+
1
,
vllm_config
.
scheduler_config
.
max_num_seqs
+
1
,
dtype
=
torch
.
int32
,
device
=
self
.
device
,
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment