Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
88dbf92c
Commit 88dbf92c authored Aug 06, 2025 by zhuwenwen
Browse files
update VLLM_FLASH_ATTN_V1 to VLLM_USE_FLASH_ATTN_PA
parent
fe657b8b
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing 4 changed files with 4 additions and 10 deletions
+4
-10
vllm/attention/layer.py
vllm/attention/layer.py
+2
-2
vllm/config.py
vllm/config.py
+1
-1
vllm/envs.py
vllm/envs.py
+0
-6
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+1
-1
No files found.
vllm/attention/layer.py
View file @
88dbf92c
...
...
@@ -75,7 +75,7 @@ class Attention(nn.Module):
             calculate_kv_scales = cache_config.calculate_kv_scales
         else:
             kv_cache_dtype = "auto"
-            block_size = 16 if not envs.VLLM_USE_FLASH_ATTN_PA or not envs.VLLM_FLASH_ATTN_V1 else 64
+            block_size = 16 if not envs.VLLM_USE_FLASH_ATTN_PA else 64
             is_attention_free = False
             calculate_kv_scales = False
         if num_kv_heads is None:
...
...
@@ -303,7 +303,7 @@ class MultiHeadAttention(nn.Module):
         attn_backend = get_attn_backend(head_size,
                                         dtype,
                                         kv_cache_dtype=None,
-                                        block_size=16 if not envs.VLLM_USE_FLASH_ATTN_PA or not envs.VLLM_FLASH_ATTN_V1 else 64,
+                                        block_size=16 if not envs.VLLM_USE_FLASH_ATTN_PA else 64,
                                         is_attention_free=False)
         backend = backend_name_to_enum(attn_backend.get_name())
         if current_platform.is_rocm():
...
...
vllm/config.py
View file @
88dbf92c
...
...
@@ -1497,7 +1497,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
 class CacheConfig:
     """Configuration for the KV cache."""

-    block_size: BlockSize = 16 if not envs.VLLM_USE_FLASH_ATTN_PA or not envs.VLLM_FLASH_ATTN_V1 else 64  # type: ignore
+    block_size: BlockSize = 16 if not envs.VLLM_USE_FLASH_ATTN_PA else 64  # type: ignore
     """Size of a contiguous cache block in number of tokens. This is ignored on
     neuron devices and set to `--max-model-len`. On CUDA devices, only block
     sizes up to 32 are supported. On HPU devices, block size defaults to 128.
...
...
vllm/envs.py
View file @
88dbf92c
...
...
@@ -154,7 +154,6 @@ if TYPE_CHECKING:
     VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16
     VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None
     VLLM_HAS_CONTEXT_DEFAULT: bool = False
-    VLLM_FLASH_ATTN_V1: bool = False
     VLLM_USE_NN: bool = False
     VLLM_ENABLE_TBO: bool = False
     VLLM_TBO_REQ_DELAY_MS: int = 0
...
...
@@ -1047,11 +1046,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # to restore the default usage.
     "VLLM_HAS_CONTEXT_DEFAULT":
     lambda: bool(int(os.getenv("VLLM_HAS_CONTEXT_DEFAULT", "0"))),

-    # If set, vLLM will use FlashAttention Backend for v1 attention computation on rocm
-    "VLLM_FLASH_ATTN_V1":
-    lambda: (os.environ.get("VLLM_FLASH_ATTN_V1", "False").lower() in
-             ("true", "1")),
-
     # If set, vLLM will transpose weight to use nn layout
     "VLLM_USE_NN":
...
...
vllm/platforms/rocm.py
View file @
88dbf92c
...
...
@@ -275,7 +275,7 @@ class RocmPlatform(Platform):
             # logger.info_once("Using Triton backend on V1 engine.")
             # return TRITON_ATTN_VLLM_V1
-            if envs.VLLM_FLASH_ATTN_V1 and block_size == 64:
+            if envs.is_set("VLLM_USE_FLASH_ATTN_PA") and envs.VLLM_USE_FLASH_ATTN_PA and block_size == 64:
                 logger.info_once(
                     "Using Flash Attention backend on V1 engine. (only supports block size 64)"
                 )
                 return FLASH_ATTN_V1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment