Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
866fa455
Unverified
Commit
866fa455
authored Dec 18, 2024 by Konrad Zawora
Committed by GitHub Dec 17, 2024
Browse files
[Bugfix] Restore support for larger block sizes (#11259)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
parent
bf8717eb
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing 2 changed files with 8 additions and 2 deletions
+8
-2
vllm/config.py
vllm/config.py
+4
-0
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+4
-2
No files found.
vllm/config.py
View file @
866fa455
...
...
@@ -917,6 +917,10 @@ class CacheConfig:
raise
ValueError
(
"GPU memory utilization must be less than 1.0. Got "
f
"
{
self
.
gpu_memory_utilization
}
."
)
if
(
current_platform
.
is_cuda
()
and
self
.
block_size
is
not
None
and
self
.
block_size
>
32
):
raise
ValueError
(
"CUDA Paged Attention kernel only supports "
f
"block sizes up to 32. Got
{
self
.
block_size
}
."
)
def
_verify_cache_dtype
(
self
)
->
None
:
if
self
.
cache_dtype
==
"auto"
:
...
...
vllm/engine/arg_utils.py
View file @
866fa455
...
...
@@ -424,10 +424,12 @@ class EngineArgs:
parser
.
add_argument
(
'--block-size'
,
type
=
int
,
default
=
EngineArgs
.
block_size
,
choices
=
[
8
,
16
,
32
],
choices
=
[
8
,
16
,
32
,
64
,
128
],
help
=
'Token block size for contiguous chunks of '
'tokens. This is ignored on neuron devices and '
'set to max-model-len'
)
'set to max-model-len. On CUDA devices, '
'only block sizes up to 32 are supported. '
'On HPU devices, block size defaults to 128.'
)
parser
.
add_argument
(
"--enable-prefix-caching"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment