norm / vllm / Commits / f936657e

Provide default max model length (#1224)

Unverified commit f936657e, authored Sep 28, 2023 by Woosuk Kwon and committed via GitHub on Sep 28, 2023.
Parent: 6f88f762
Changes: 4 changed files with 14 additions and 9 deletions (+14 -9)

vllm/config.py                          +11  -7
vllm/engine/arg_utils.py                 +1  -1
vllm/engine/llm_engine.py                +1  -0
vllm/entrypoints/openai/api_server.py    +1  -1
vllm/config.py (view file @ f936657e)

@@ -164,9 +164,6 @@ class ModelConfig:
         total_num_attention_heads = self.hf_config.num_attention_heads
         return total_num_attention_heads // parallel_config.tensor_parallel_size

-    def get_max_model_len(self) -> int:
-        return self.max_model_len
-
     def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
         total_num_hidden_layers = self.hf_config.num_hidden_layers
         return total_num_hidden_layers // parallel_config.pipeline_parallel_size

@@ -378,10 +375,17 @@ def _get_and_verify_max_len(
         if max_len_key is not None:
             derived_max_model_len = min(derived_max_model_len, max_len_key)
     if derived_max_model_len == float("inf"):
-        raise ValueError(
-            "The model's config.json must contain one of the following keys "
-            "to determine the original maximum length of the model: "
-            f"{possible_keys}")
+        if max_model_len is not None:
+            # If max_model_len is specified, we use it.
+            return max_model_len
+
+        default_max_len = 2048
+        logger.warning(
+            "The model's config.json does not contain any of the following "
+            "keys to determine the original maximum length of the model: "
+            f"{possible_keys}. Assuming the model's maximum length is "
+            f"{default_max_len}.")
+        derived_max_model_len = default_max_len

     rope_scaling = getattr(hf_config, "rope_scaling", None)
     if rope_scaling is not None:
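Taken together, the two hunks drop the get_max_model_len() accessor and turn the hard ValueError into a warning plus a 2048-token default. Below is a minimal, self-contained sketch of the new fallback behavior; the possible_keys list is an illustrative stand-in, since the real list is defined earlier in _get_and_verify_max_len and is not part of this hunk:

    import logging

    logger = logging.getLogger(__name__)

    def derive_max_model_len_sketch(hf_config, max_model_len=None):
        # Illustrative subset of length-related keys; the actual list in
        # vllm/config.py is longer and not shown in this diff.
        possible_keys = ["max_position_embeddings", "n_positions", "max_seq_len"]
        derived_max_model_len = float("inf")
        for key in possible_keys:
            max_len_key = getattr(hf_config, key, None)
            if max_len_key is not None:
                derived_max_model_len = min(derived_max_model_len, max_len_key)
        if derived_max_model_len == float("inf"):
            if max_model_len is not None:
                # An explicit user-supplied value takes precedence.
                return max_model_len
            default_max_len = 2048
            logger.warning(
                "config.json contains none of %s; assuming the model's "
                "maximum length is %d.", possible_keys, default_max_len)
            derived_max_model_len = default_max_len
        return int(derived_max_model_len)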
vllm/engine/arg_utils.py (view file @ f936657e)

@@ -184,7 +184,7 @@ class EngineArgs:
                                          self.worker_use_ray)
         scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                            self.max_num_seqs,
-                                           model_config.get_max_model_len())
+                                           model_config.max_model_len)
         return model_config, cache_config, parallel_config, scheduler_config
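The third positional argument of SchedulerConfig is the maximum model length, which now comes straight off the ModelConfig attribute rather than through the removed getter. A hypothetical call with literal values (the keyword names are assumed; the call site above passes them positionally):

    # Hypothetical values for illustration only.
    scheduler_config = SchedulerConfig(
        max_num_batched_tokens=2560,  # per-iteration token budget
        max_num_seqs=256,             # max concurrently scheduled sequences
        max_model_len=2048,           # read directly from ModelConfig
    )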
vllm/engine/llm_engine.py (view file @ f936657e)

@@ -77,6 +77,7 @@ class LLMEngine:
             f"revision={model_config.revision}, "
             f"trust_remote_code={model_config.trust_remote_code}, "
             f"dtype={model_config.dtype}, "
+            f"max_seq_len={model_config.max_model_len}, "
             f"download_dir={model_config.download_dir!r}, "
             f"load_format={model_config.load_format}, "
             f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
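With the added f-string, the engine's startup log now reports the resolved maximum length alongside the other config fields. For a model whose config.json lacks any length key, the fragment visible in this hunk would render roughly as (illustrative values):

    revision=None, trust_remote_code=False, dtype=torch.float16,
    max_seq_len=2048, download_dir=None, load_format=auto,
    tensor_parallel_size=1,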
vllm/entrypoints/openai/api_server.py (view file @ f936657e)

@@ -615,7 +615,7 @@ if __name__ == "__main__":
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     engine_model_config = asyncio.run(engine.get_model_config())
-    max_model_len = engine_model_config.get_max_model_len()
+    max_model_len = engine_model_config.max_model_len

     # A separate tokenizer to map token IDs to strings.
     tokenizer = get_tokenizer(engine_args.tokenizer,
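Because the server now reads max_model_len directly, a deployment that wants something other than the derived value (or the new 2048 fallback) can still set it explicitly. This assumes the --max-model-len engine argument defined in EngineArgs, which is not part of this diff:

    # Hypothetical invocation; the flag overrides the derived default.
    python -m vllm.entrypoints.openai.api_server \
        --model facebook/opt-125m \
        --max-model-len 4096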