SIYIXNI / vllm
Unverified commit f936657e, authored Sep 28, 2023 by Woosuk Kwon, committed by GitHub on Sep 28, 2023

Provide default max model length (#1224)

Parent: 6f88f762
Showing 4 changed files with 14 additions and 9 deletions:

  vllm/config.py                          +11  -7
  vllm/engine/arg_utils.py                 +1  -1
  vllm/engine/llm_engine.py                +1  -0
  vllm/entrypoints/openai/api_server.py    +1  -1
vllm/config.py  (view file @ f936657e)

@@ -164,9 +164,6 @@ class ModelConfig:
         total_num_attention_heads = self.hf_config.num_attention_heads
         return total_num_attention_heads // parallel_config.tensor_parallel_size
 
-    def get_max_model_len(self) -> int:
-        return self.max_model_len
-
     def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
         total_num_hidden_layers = self.hf_config.num_hidden_layers
         return total_num_hidden_layers // parallel_config.pipeline_parallel_size

@@ -378,10 +375,17 @@ def _get_and_verify_max_len(
         if max_len_key is not None:
             derived_max_model_len = min(derived_max_model_len, max_len_key)
     if derived_max_model_len == float("inf"):
-        raise ValueError(
-            "The model's config.json must contain one of the following keys "
-            "to determine the original maximum length of the model: "
-            f"{possible_keys}")
+        if max_model_len is not None:
+            # If max_model_len is specified, we use it.
+            return max_model_len
+
+        default_max_len = 2048
+        logger.warning(
+            "The model's config.json does not contain any of the following "
+            "keys to determine the original maximum length of the model: "
+            f"{possible_keys}. Assuming the model's maximum length is "
+            f"{default_max_len}.")
+        derived_max_model_len = default_max_len
 
     rope_scaling = getattr(hf_config, "rope_scaling", None)
     if rope_scaling is not None:
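The substantive change is the second hunk: when config.json exposes none of the recognized maximum-length keys, the function no longer raises. It now returns a user-supplied max_model_len if one was given, and otherwise warns and falls back to a default of 2048. A minimal standalone sketch of that fallback follows; the helper name resolve_max_model_len and its signature are assumptions made for illustration, since in the commit this logic lives inside _get_and_verify_max_len:

import logging
from typing import List, Optional

logger = logging.getLogger(__name__)


def resolve_max_model_len(derived_max_model_len: float,
                          max_model_len: Optional[int],
                          possible_keys: List[str]) -> int:
    """Sketch of the fallback added in this commit.

    derived_max_model_len starts at float("inf") and is shrunk by whichever
    of possible_keys were found in the model's config.json; if none were
    found, it is still infinite when this check runs.
    """
    if derived_max_model_len == float("inf"):
        if max_model_len is not None:
            # If max_model_len is specified, we use it.
            return max_model_len

        default_max_len = 2048
        logger.warning(
            "The model's config.json does not contain any of the following "
            "keys to determine the original maximum length of the model: "
            f"{possible_keys}. Assuming the model's maximum length is "
            f"{default_max_len}.")
        derived_max_model_len = default_max_len
    return int(derived_max_model_len)


# Illustrative call: no key found in config.json and no user override -> 2048.
# resolve_max_model_len(float("inf"), None, ["max_position_embeddings"]) == 2048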
vllm/engine/arg_utils.py  (view file @ f936657e)

@@ -184,7 +184,7 @@ class EngineArgs:
                                          self.worker_use_ray)
         scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                            self.max_num_seqs,
-                                           model_config.get_max_model_len())
+                                           model_config.max_model_len)
         return model_config, cache_config, parallel_config, scheduler_config
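Because the get_max_model_len() accessor was deleted from ModelConfig in the first hunk above, this call site (and the one in api_server.py below) switches to plain attribute access. A hedged usage sketch; the enclosing method name create_engine_configs() and the example model name are assumptions, since neither appears in this hunk:

from vllm.engine.arg_utils import EngineArgs

# Build the engine configs from argument defaults; the resolved context
# length is now read straight off the ModelConfig.
engine_args = EngineArgs(model="facebook/opt-125m")  # example model only
model_config, cache_config, parallel_config, scheduler_config = (
    engine_args.create_engine_configs())

print(model_config.max_model_len)   # attribute access
# model_config.get_max_model_len()  # removed by this commit; would now raise AttributeError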
vllm/engine/llm_engine.py  (view file @ f936657e)

@@ -77,6 +77,7 @@ class LLMEngine:
             f"revision={model_config.revision}, "
             f"trust_remote_code={model_config.trust_remote_code}, "
             f"dtype={model_config.dtype}, "
+            f"max_seq_len={model_config.max_model_len}, "
             f"download_dir={model_config.download_dir!r}, "
             f"load_format={model_config.load_format}, "
             f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
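The only change here is one extra fragment in the engine's start-up log, so the resolved context length is visible at initialization. A small sketch of how these adjacent f-string literals concatenate into a single logging call; the surrounding logger.info(...) call, its prefix text, and the selection of fields shown are assumptions, since they sit outside this hunk:

import logging

logger = logging.getLogger("vllm.engine.llm_engine")


def log_engine_config(model_config, parallel_config) -> None:
    # Adjacent string literals are joined at compile time, so this emits one message.
    logger.info(
        "Initializing an LLM engine with config: "              # prefix assumed
        f"dtype={model_config.dtype}, "
        f"max_seq_len={model_config.max_model_len}, "            # fragment added by this commit
        f"tensor_parallel_size={parallel_config.tensor_parallel_size}")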
vllm/entrypoints/openai/api_server.py  (view file @ f936657e)

@@ -615,7 +615,7 @@ if __name__ == "__main__":
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     engine_model_config = asyncio.run(engine.get_model_config())
-    max_model_len = engine_model_config.get_max_model_len()
+    max_model_len = engine_model_config.max_model_len
 
     # A separate tokenizer to map token IDs to strings.
     tokenizer = get_tokenizer(engine_args.tokenizer,
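Same substitution as in arg_utils.py: the OpenAI-compatible server reads max_model_len as an attribute when it builds its state at start-up. A hedged, self-contained sketch of that sequence; the import paths, the helper name build_server_state, and the single-argument get_tokenizer call are assumptions based on what this hunk shows, and the real entrypoint runs this at module scope under if __name__ == "__main__":

import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.transformers_utils.tokenizer import get_tokenizer


def build_server_state(args):
    """Construct the async engine and the pieces the HTTP handlers need."""
    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    engine_model_config = asyncio.run(engine.get_model_config())
    # Attribute access replaces the removed get_max_model_len() accessor;
    # the server consults this value elsewhere when validating request lengths.
    max_model_len = engine_model_config.max_model_len

    # A separate tokenizer to map token IDs to strings.
    tokenizer = get_tokenizer(engine_args.tokenizer)
    return engine, tokenizer, max_model_len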