Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1f0ae3ed
Unverified
Commit
1f0ae3ed
authored
Feb 24, 2025
by
Robert Shaw
Committed by
GitHub
Feb 24, 2025
Browse files
[Misc] Clean Up `EngineArgs.create_engine_config` (#13734)
Signed-off-by:
rshaw@neuralmagic.com
<
rshaw@neuralmagic.com
>
parent
db986c19
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
29 additions
and
40 deletions
+29
-40
vllm/config.py
vllm/config.py
+4
-0
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+25
-40
No files found.
vllm/config.py
View file @
1f0ae3ed
...
@@ -1124,6 +1124,10 @@ class CacheConfig:
...
@@ -1124,6 +1124,10 @@ class CacheConfig:
return
{
key
:
str
(
value
)
for
key
,
value
in
self
.
__dict__
.
items
()}
return
{
key
:
str
(
value
)
for
key
,
value
in
self
.
__dict__
.
items
()}
def
_verify_args
(
self
)
->
None
:
def
_verify_args
(
self
)
->
None
:
if
self
.
cpu_offload_gb
<
0
:
raise
ValueError
(
"CPU offload space must be non-negative"
f
", but got
{
self
.
cpu_offload_gb
}
"
)
if
self
.
gpu_memory_utilization
>
1.0
:
if
self
.
gpu_memory_utilization
>
1.0
:
raise
ValueError
(
raise
ValueError
(
"GPU memory utilization must be less than 1.0. Got "
"GPU memory utilization must be less than 1.0. Got "
...
...
vllm/engine/arg_utils.py
View file @
1f0ae3ed
...
@@ -1062,6 +1062,17 @@ class EngineArgs:
...
@@ -1062,6 +1062,17 @@ class EngineArgs:
return
engine_args
return
engine_args
def
create_model_config
(
self
)
->
ModelConfig
:
def
create_model_config
(
self
)
->
ModelConfig
:
# gguf file needs a specific model loader and doesn't use hf_repo
if
check_gguf_file
(
self
.
model
):
self
.
quantization
=
self
.
load_format
=
"gguf"
# NOTE: This is to allow model loading from S3 in CI
if
(
not
isinstance
(
self
,
AsyncEngineArgs
)
and
envs
.
VLLM_CI_USE_S3
and
self
.
model
in
MODELS_ON_S3
and
self
.
load_format
==
LoadFormat
.
AUTO
):
# noqa: E501
self
.
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/
{
self
.
model
}
"
self
.
load_format
=
LoadFormat
.
RUNAI_STREAMER
return
ModelConfig
(
return
ModelConfig
(
model
=
self
.
model
,
model
=
self
.
model
,
task
=
self
.
task
,
task
=
self
.
task
,
...
@@ -1101,26 +1112,6 @@ class EngineArgs:
...
@@ -1101,26 +1112,6 @@ class EngineArgs:
)
)
def
create_load_config
(
self
)
->
LoadConfig
:
def
create_load_config
(
self
)
->
LoadConfig
:
return
LoadConfig
(
load_format
=
self
.
load_format
,
download_dir
=
self
.
download_dir
,
model_loader_extra_config
=
self
.
model_loader_extra_config
,
ignore_patterns
=
self
.
ignore_patterns
,
)
def
create_engine_config
(
self
,
usage_context
:
Optional
[
UsageContext
]
=
None
)
->
VllmConfig
:
from
vllm.platforms
import
current_platform
current_platform
.
pre_register_and_update
()
if
envs
.
VLLM_USE_V1
:
self
.
_override_v1_engine_args
(
usage_context
)
# gguf file needs a specific model loader and doesn't use hf_repo
if
check_gguf_file
(
self
.
model
):
self
.
quantization
=
self
.
load_format
=
"gguf"
# bitsandbytes quantization needs a specific model loader
# bitsandbytes quantization needs a specific model loader
# so we make sure the quant method and the load format are consistent
# so we make sure the quant method and the load format are consistent
if
(
self
.
quantization
==
"bitsandbytes"
or
if
(
self
.
quantization
==
"bitsandbytes"
or
...
@@ -1137,19 +1128,23 @@ class EngineArgs:
...
@@ -1137,19 +1128,23 @@ class EngineArgs:
"BitsAndBytes load format and QLoRA adapter only support "
"BitsAndBytes load format and QLoRA adapter only support "
f
"'bitsandbytes' quantization, but got
{
self
.
quantization
}
"
)
f
"'bitsandbytes' quantization, but got
{
self
.
quantization
}
"
)
assert
self
.
cpu_offload_gb
>=
0
,
(
return
LoadConfig
(
"CPU offload space must be non-negative"
load_format
=
self
.
load_format
,
f
", but got
{
self
.
cpu_offload_gb
}
"
)
download_dir
=
self
.
download_dir
,
model_loader_extra_config
=
self
.
model_loader_extra_config
,
ignore_patterns
=
self
.
ignore_patterns
,
)
device_config
=
DeviceConfig
(
device
=
self
.
device
)
def
create_engine_config
(
self
,
usage_context
:
Optional
[
UsageContext
]
=
None
)
->
VllmConfig
:
from
vllm.platforms
import
current_platform
current_platform
.
pre_register_and_update
()
# NOTE: This is to allow model loading from S3 in CI
if
envs
.
VLLM_USE_V1
:
if
(
not
isinstance
(
self
,
AsyncEngineArgs
)
and
envs
.
VLLM_CI_USE_S3
self
.
_override_v1_engine_args
(
usage_context
)
and
self
.
model
in
MODELS_ON_S3
and
self
.
load_format
==
LoadFormat
.
AUTO
):
# noqa: E501
self
.
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/
{
self
.
model
}
"
self
.
load_format
=
LoadFormat
.
RUNAI_STREAMER
device_config
=
DeviceConfig
(
device
=
self
.
device
)
model_config
=
self
.
create_model_config
()
model_config
=
self
.
create_model_config
()
if
(
model_config
.
is_multimodal_model
and
not
envs
.
VLLM_USE_V1
if
(
model_config
.
is_multimodal_model
and
not
envs
.
VLLM_USE_V1
...
@@ -1281,16 +1276,6 @@ class EngineArgs:
...
@@ -1281,16 +1276,6 @@ class EngineArgs:
if
speculative_config
is
None
\
if
speculative_config
is
None
\
else
speculative_config
.
num_lookahead_slots
else
speculative_config
.
num_lookahead_slots
if
not
self
.
use_v2_block_manager
:
logger
.
warning
(
"[DEPRECATED] Block manager v1 has been removed, "
"and setting --use-v2-block-manager to True or False has "
"no effect on vLLM behavior. Please remove "
"--use-v2-block-manager in your engine argument. "
"If your use case is not supported by "
"SelfAttnBlockSpaceManager (i.e. block manager v2),"
" please file an issue with detailed information."
)
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
runner_type
=
model_config
.
runner_type
,
runner_type
=
model_config
.
runner_type
,
max_num_batched_tokens
=
self
.
max_num_batched_tokens
,
max_num_batched_tokens
=
self
.
max_num_batched_tokens
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment