Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b61f7a69
Commit
b61f7a69
authored
Jul 17, 2025
by
zhuwenwen
Browse files
set default block_size and pa
parent
1092a467
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
3 additions
and
3 deletions
+3
-3
vllm/attention/layer.py
vllm/attention/layer.py
+2
-2
vllm/envs.py
vllm/envs.py
+1
-1
No files found.
vllm/attention/layer.py
View file @
b61f7a69
...
@@ -75,7 +75,7 @@ class Attention(nn.Module):
...
@@ -75,7 +75,7 @@ class Attention(nn.Module):
calculate_kv_scales
=
cache_config
.
calculate_kv_scales
calculate_kv_scales
=
cache_config
.
calculate_kv_scales
else
:
else
:
kv_cache_dtype
=
"auto"
kv_cache_dtype
=
"auto"
block_size
=
16
block_size
=
64
is_attention_free
=
False
is_attention_free
=
False
calculate_kv_scales
=
False
calculate_kv_scales
=
False
if
num_kv_heads
is
None
:
if
num_kv_heads
is
None
:
...
@@ -298,7 +298,7 @@ class MultiHeadAttention(nn.Module):
...
@@ -298,7 +298,7 @@ class MultiHeadAttention(nn.Module):
attn_backend
=
get_attn_backend
(
head_size
,
attn_backend
=
get_attn_backend
(
head_size
,
dtype
,
dtype
,
kv_cache_dtype
=
None
,
kv_cache_dtype
=
None
,
block_size
=
16
,
block_size
=
64
,
is_attention_free
=
False
)
is_attention_free
=
False
)
backend
=
backend_name_to_enum
(
attn_backend
.
get_name
())
backend
=
backend_name_to_enum
(
attn_backend
.
get_name
())
if
backend
in
{
_Backend
.
FLASH_ATTN
,
_Backend
.
FLASH_ATTN_VLLM_V1
}:
if
backend
in
{
_Backend
.
FLASH_ATTN
,
_Backend
.
FLASH_ATTN_VLLM_V1
}:
...
...
vllm/envs.py
View file @
b61f7a69
...
@@ -995,7 +995,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -995,7 +995,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vLLM will use FlashAttention Backend for page attention computation on rocm
# vLLM will use FlashAttention Backend for page attention computation on rocm
"VLLM_USE_FLASH_ATTN_PA"
:
"VLLM_USE_FLASH_ATTN_PA"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_FLASH_ATTN_PA"
,
"False"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_FLASH_ATTN_PA"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# vLLM will use apex for rmsnorm
# vLLM will use apex for rmsnorm
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment