Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ca0fea07
Commit
ca0fea07
authored
Oct 11, 2025
by
zhuwenwen
Browse files
Merge branch 'v0.9.2-dev' of
ssh://10.16.6.30:10022/dcutoolkit/deeplearing/vllm
into v0.9.2-dev
parents
bfc220c7
3daae57c
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
3 additions
and
3 deletions
+3
-3
vllm/attention/layer.py
vllm/attention/layer.py
+2
-2
vllm/config.py
vllm/config.py
+1
-1
No files found.
vllm/attention/layer.py
View file @
ca0fea07
...
...
@@ -76,7 +76,7 @@ class Attention(nn.Module):
             calculate_kv_scales = cache_config.calculate_kv_scales
         else:
             kv_cache_dtype = "auto"
-            block_size = 64 if envs.VLLM_USE_FLASH_ATTN_PA or envs.VLLM_USE_FLASH_MLA else 16
+            block_size = 64 if envs.VLLM_USE_FLASH_ATTN_PA and envs.VLLM_USE_FLASH_MLA else 16
             is_attention_free = False
             calculate_kv_scales = False
         if num_kv_heads is None:
...
...
@@ -305,7 +305,7 @@ class MultiHeadAttention(nn.Module):
         attn_backend = get_attn_backend(head_size,
                                         dtype,
                                         kv_cache_dtype=None,
-                                        block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA or envs.VLLM_USE_FLASH_MLA else 16,
+                                        block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA and envs.VLLM_USE_FLASH_MLA else 16,
                                         is_attention_free=False)
         backend = backend_name_to_enum(attn_backend.get_name())
         if current_platform.is_rocm():
...
...
vllm/config.py
View file @
ca0fea07
...
...
@@ -1499,7 +1499,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
 class CacheConfig:
     """Configuration for the KV cache."""
 
-    block_size: BlockSize = 64 if envs.VLLM_USE_FLASH_ATTN_PA or envs.VLLM_USE_FLASH_MLA else 16  # type: ignore
+    block_size: BlockSize = 64 if envs.VLLM_USE_FLASH_ATTN_PA and envs.VLLM_USE_FLASH_MLA else 16  # type: ignore
     """Size of a contiguous cache block in number of tokens. This is ignored on
     neuron devices and set to `--max-model-len`. On CUDA devices, only block
     sizes up to 32 are supported. On HPU devices, block size defaults to 128.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment