Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
88dbf92c
Commit
88dbf92c
authored
Aug 06, 2025
by
zhuwenwen
Browse files
update VLLM_FLASH_ATTN_V1 to VLLM_USE_FLASH_ATTN_PA
parent
fe657b8b
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
4 additions
and
10 deletions
+4
-10
vllm/attention/layer.py
vllm/attention/layer.py
+2
-2
vllm/config.py
vllm/config.py
+1
-1
vllm/envs.py
vllm/envs.py
+0
-6
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+1
-1
No files found.
vllm/attention/layer.py
View file @
88dbf92c
...
@@ -75,7 +75,7 @@ class Attention(nn.Module):
...
@@ -75,7 +75,7 @@ class Attention(nn.Module):
calculate_kv_scales
=
cache_config
.
calculate_kv_scales
calculate_kv_scales
=
cache_config
.
calculate_kv_scales
else
:
else
:
kv_cache_dtype
=
"auto"
kv_cache_dtype
=
"auto"
block_size
=
16
if
not
envs
.
VLLM_USE_FLASH_ATTN_PA
or
not
envs
.
VLLM_FLASH_ATTN_V1
else
64
block_size
=
16
if
not
envs
.
VLLM_USE_FLASH_ATTN_PA
else
64
is_attention_free
=
False
is_attention_free
=
False
calculate_kv_scales
=
False
calculate_kv_scales
=
False
if
num_kv_heads
is
None
:
if
num_kv_heads
is
None
:
...
@@ -303,7 +303,7 @@ class MultiHeadAttention(nn.Module):
...
@@ -303,7 +303,7 @@ class MultiHeadAttention(nn.Module):
attn_backend
=
get_attn_backend
(
head_size
,
attn_backend
=
get_attn_backend
(
head_size
,
dtype
,
dtype
,
kv_cache_dtype
=
None
,
kv_cache_dtype
=
None
,
block_size
=
16
if
not
envs
.
VLLM_USE_FLASH_ATTN_PA
or
not
envs
.
VLLM_FLASH_ATTN_V1
else
64
,
block_size
=
16
if
not
envs
.
VLLM_USE_FLASH_ATTN_PA
else
64
,
is_attention_free
=
False
)
is_attention_free
=
False
)
backend
=
backend_name_to_enum
(
attn_backend
.
get_name
())
backend
=
backend_name_to_enum
(
attn_backend
.
get_name
())
if
current_platform
.
is_rocm
():
if
current_platform
.
is_rocm
():
...
...
vllm/config.py
View file @
88dbf92c
...
@@ -1497,7 +1497,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
...
@@ -1497,7 +1497,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
class
CacheConfig
:
class
CacheConfig
:
"""Configuration for the KV cache."""
"""Configuration for the KV cache."""
block_size
:
BlockSize
=
16
if
not
envs
.
VLLM_USE_FLASH_ATTN_PA
or
not
envs
.
VLLM_FLASH_ATTN_V1
else
64
# type: ignore
block_size
:
BlockSize
=
16
if
not
envs
.
VLLM_USE_FLASH_ATTN_PA
else
64
# type: ignore
"""Size of a contiguous cache block in number of tokens. This is ignored on
"""Size of a contiguous cache block in number of tokens. This is ignored on
neuron devices and set to `--max-model-len`. On CUDA devices, only block
neuron devices and set to `--max-model-len`. On CUDA devices, only block
sizes up to 32 are supported. On HPU devices, block size defaults to 128.
sizes up to 32 are supported. On HPU devices, block size defaults to 128.
...
...
vllm/envs.py
View file @
88dbf92c
...
@@ -154,7 +154,6 @@ if TYPE_CHECKING:
...
@@ -154,7 +154,6 @@ if TYPE_CHECKING:
VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX
:
int
=
16
VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX
:
int
=
16
VLLM_ENFORCE_EAGER_BS_THRESHOLD
:
Optional
[
int
]
=
None
VLLM_ENFORCE_EAGER_BS_THRESHOLD
:
Optional
[
int
]
=
None
VLLM_HAS_CONTEXT_DEFAULT
:
bool
=
False
VLLM_HAS_CONTEXT_DEFAULT
:
bool
=
False
VLLM_FLASH_ATTN_V1
:
bool
=
False
VLLM_USE_NN
:
bool
=
False
VLLM_USE_NN
:
bool
=
False
VLLM_ENABLE_TBO
:
bool
=
False
VLLM_ENABLE_TBO
:
bool
=
False
VLLM_TBO_REQ_DELAY_MS
:
int
=
0
VLLM_TBO_REQ_DELAY_MS
:
int
=
0
...
@@ -1047,11 +1046,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1047,11 +1046,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
# to restore the default usage.
# to restore the default usage.
"VLLM_HAS_CONTEXT_DEFAULT"
:
"VLLM_HAS_CONTEXT_DEFAULT"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_HAS_CONTEXT_DEFAULT"
,
"0"
))),
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_HAS_CONTEXT_DEFAULT"
,
"0"
))),
# If set, vLLM will use FlashAttention Backend for v1 attention computation on rocm
"VLLM_FLASH_ATTN_V1"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_FLASH_ATTN_V1"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# If set, vLLM will transpose weight to use nn layout
# If set, vLLM will transpose weight to use nn layout
"VLLM_USE_NN"
:
"VLLM_USE_NN"
:
...
...
vllm/platforms/rocm.py
View file @
88dbf92c
...
@@ -275,7 +275,7 @@ class RocmPlatform(Platform):
...
@@ -275,7 +275,7 @@ class RocmPlatform(Platform):
# logger.info_once("Using Triton backend on V1 engine.")
# logger.info_once("Using Triton backend on V1 engine.")
# return TRITON_ATTN_VLLM_V1
# return TRITON_ATTN_VLLM_V1
if
envs
.
VLLM_FLASH_ATTN_V1
and
block_size
==
64
:
if
envs
.
is_set
(
"VLLM_USE_FLASH_ATTN_PA"
)
and
envs
.
VLLM_USE_FLASH_ATTN_PA
and
block_size
==
64
:
logger
.
info_once
(
"Using Flash Attention backend on V1 engine. (only supports block size 64)"
)
logger
.
info_once
(
"Using Flash Attention backend on V1 engine. (only supports block size 64)"
)
return
FLASH_ATTN_V1
return
FLASH_ATTN_V1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment