Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
134810b3
Unverified
Commit
134810b3
authored
Dec 10, 2024
by
Woosuk Kwon
Committed by
GitHub
Dec 10, 2024
Browse files
[V1][Bugfix] Always set enable_chunked_prefill = True for V1 (#11061)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent
75f89dc4
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing 1 changed file with 12 additions and 9 deletions
+12
-9
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+12
-9
No files found.
vllm/engine/arg_utils.py
View file @
134810b3
...
...
@@ -122,7 +122,7 @@ class EngineArgs:
cpu_offload_gb
:
float
=
0
# GiB
gpu_memory_utilization
:
float
=
0.90
max_num_batched_tokens
:
Optional
[
int
]
=
None
max_num_seqs
:
int
=
256
max_num_seqs
:
Optional
[
int
]
=
None
max_logprobs
:
int
=
20
# Default value for OpenAI Chat Completions API
disable_log_stats
:
bool
=
False
revision
:
Optional
[
str
]
=
None
...
...
@@ -205,6 +205,9 @@ class EngineArgs:
# by user.
if
self
.
enable_prefix_caching
is
None
:
self
.
enable_prefix_caching
=
bool
(
envs
.
VLLM_USE_V1
)
# Override max_num_seqs if it's not set by user.
if
self
.
max_num_seqs
is
None
:
self
.
max_num_seqs
=
256
if
not
envs
.
VLLM_USE_V1
else
1024
# support `EngineArgs(compilation_config={...})`
# without having to manually construct a
...
...
@@ -1225,19 +1228,19 @@ class EngineArgs:
"""
assert
envs
.
VLLM_USE_V1
,
"V1 is not enabled"
# V1 always uses chunked prefills.
self
.
enable_chunked_prefill
=
True
# When no user override, set the default values based on the usage
# context.
# TODO(woosuk): Tune the default values for different hardware.
if
self
.
max_num_batched_tokens
is
None
:
# When no user override, set the default values based on the
# usage context.
if
usage_context
==
UsageContext
.
LLM_CLASS
:
logger
.
warning
(
"Setting max_num_batched_tokens to 8192 "
"for LLM_CLASS usage context."
)
self
.
max_num_seqs
=
1024
self
.
max_num_batched_tokens
=
8192
elif
usage_context
==
UsageContext
.
OPENAI_API_SERVER
:
logger
.
warning
(
"Setting max_num_batched_tokens to 2048 "
"for OPENAI_API_SERVER usage context."
)
self
.
max_num_seqs
=
1024
self
.
max_num_batched_tokens
=
2048
logger
.
warning
(
"Setting max_num_batched_tokens to %d for %s usage context."
,
self
.
max_num_batched_tokens
,
usage_context
.
value
)
def
_override_v1_engine_config
(
self
,
engine_config
:
VllmConfig
)
->
None
:
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment