Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f12b20de
Unverified
Commit
f12b20de
authored
May 09, 2024
by
Cyrus Leung
Committed by
GitHub
May 08, 2024
Browse files
[Frontend] Move async logic outside of constructor (#4674)
parent
16bc0a09
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
96 additions
and
102 deletions
+96
-102
tests/async_engine/test_chat_template.py
tests/async_engine/test_chat_template.py
+13
-17
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+6
-2
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+1
-1
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+20
-3
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+35
-37
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+4
-3
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+17
-39
No files found.
tests/async_engine/test_chat_template.py
View file @
f12b20de
...
@@ -60,13 +60,12 @@ class MockServingChat:
...
@@ -60,13 +60,12 @@ class MockServingChat:
tokenizer
:
MockTokenizer
tokenizer
:
MockTokenizer
@
pytest
.
mark
.
asyncio
def
test_load_chat_template
():
async
def
test_load_chat_template
():
# Testing chatml template
# Testing chatml template
tokenizer
=
MockTokenizer
()
tokenizer
=
MockTokenizer
()
mock_serving_chat
=
MockServingChat
(
tokenizer
)
mock_serving_chat
=
MockServingChat
(
tokenizer
)
await
OpenAIServingChat
.
_load_chat_template
(
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
mock_serving_chat
,
chat_template
=
chatml_jinja_path
)
chat_template
=
chatml_jinja_path
)
template_content
=
tokenizer
.
chat_template
template_content
=
tokenizer
.
chat_template
...
@@ -77,8 +76,7 @@ async def test_load_chat_template():
...
@@ -77,8 +76,7 @@ async def test_load_chat_template():
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant
\\
n' }}{% endif %}"""
# noqa: E501
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant
\\
n' }}{% endif %}"""
# noqa: E501
@
pytest
.
mark
.
asyncio
def
test_no_load_chat_template_filelike
():
async
def
test_no_load_chat_template_filelike
():
# Testing chatml template
# Testing chatml template
template
=
"../../examples/does_not_exist"
template
=
"../../examples/does_not_exist"
tokenizer
=
MockTokenizer
()
tokenizer
=
MockTokenizer
()
...
@@ -86,34 +84,32 @@ async def test_no_load_chat_template_filelike():
...
@@ -86,34 +84,32 @@ async def test_no_load_chat_template_filelike():
mock_serving_chat
=
MockServingChat
(
tokenizer
)
mock_serving_chat
=
MockServingChat
(
tokenizer
)
with
pytest
.
raises
(
ValueError
,
match
=
"looks like a file path"
):
with
pytest
.
raises
(
ValueError
,
match
=
"looks like a file path"
):
await
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
chat_template
=
template
)
@
pytest
.
mark
.
asyncio
def
test_no_load_chat_template_literallike
():
async
def
test_no_load_chat_template_literallike
():
# Testing chatml template
# Testing chatml template
template
=
"{{ messages }}"
template
=
"{{ messages }}"
tokenizer
=
MockTokenizer
()
tokenizer
=
MockTokenizer
()
mock_serving_chat
=
MockServingChat
(
tokenizer
)
mock_serving_chat
=
MockServingChat
(
tokenizer
)
await
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
chat_template
=
template
)
template_content
=
tokenizer
.
chat_template
template_content
=
tokenizer
.
chat_template
assert
template_content
==
template
assert
template_content
==
template
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model,template,add_generation_prompt,expected_output"
,
"model,template,add_generation_prompt,expected_output"
,
MODEL_TEMPLATE_GENERATON_OUTPUT
)
MODEL_TEMPLATE_GENERATON_OUTPUT
)
async
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
expected_output
):
expected_output
):
# Initialize the tokenizer
# Initialize the tokenizer
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model
)
mock_serving_chat
=
MockServingChat
(
tokenizer
)
mock_serving_chat
=
MockServingChat
(
tokenizer
)
await
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
chat_template
=
template
)
# Create a mock request object using keyword arguments
# Create a mock request object using keyword arguments
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
f12b20de
...
@@ -20,11 +20,15 @@ class MockModelConfig:
...
@@ -20,11 +20,15 @@ class MockModelConfig:
class
MockEngine
:
class
MockEngine
:
async
def
get_model_config
(
self
):
async
def
get_model_config
(
self
):
return
MockModelConfig
return
MockModelConfig
()
async
def
_async_serving_chat_init
():
async
def
_async_serving_chat_init
():
serving_completion
=
OpenAIServingChat
(
MockEngine
(),
engine
=
MockEngine
()
model_config
=
await
engine
.
get_model_config
()
serving_completion
=
OpenAIServingChat
(
engine
,
model_config
,
served_model_names
=
[
MODEL_NAME
],
served_model_names
=
[
MODEL_NAME
],
response_role
=
"assistant"
,
response_role
=
"assistant"
,
chat_template
=
CHAT_TEMPLATE
)
chat_template
=
CHAT_TEMPLATE
)
...
...
vllm/engine/arg_utils.py
View file @
f12b20de
...
@@ -516,7 +516,7 @@ class EngineArgs:
...
@@ -516,7 +516,7 @@ class EngineArgs:
return
parser
return
parser
@
classmethod
@
classmethod
def
from_cli_args
(
cls
,
args
:
argparse
.
Namespace
)
->
'EngineArgs'
:
def
from_cli_args
(
cls
,
args
:
argparse
.
Namespace
):
# Get the list of attributes of this dataclass.
# Get the list of attributes of this dataclass.
attrs
=
[
attr
.
name
for
attr
in
dataclasses
.
fields
(
cls
)]
attrs
=
[
attr
.
name
for
attr
in
dataclasses
.
fields
(
cls
)]
# Set the attributes from the parsed arguments.
# Set the attributes from the parsed arguments.
...
...
vllm/entrypoints/openai/api_server.py
View file @
f12b20de
...
@@ -4,7 +4,7 @@ import inspect
...
@@ -4,7 +4,7 @@ import inspect
import
re
import
re
from
contextlib
import
asynccontextmanager
from
contextlib
import
asynccontextmanager
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
Set
from
typing
import
Optional
,
Set
import
fastapi
import
fastapi
import
uvicorn
import
uvicorn
...
@@ -164,15 +164,32 @@ if __name__ == "__main__":
...
@@ -164,15 +164,32 @@ if __name__ == "__main__":
served_model_names
=
args
.
served_model_name
served_model_names
=
args
.
served_model_name
else
:
else
:
served_model_names
=
[
args
.
model
]
served_model_names
=
[
args
.
model
]
engine_args
=
AsyncEngineArgs
.
from_cli_args
(
args
)
engine_args
=
AsyncEngineArgs
.
from_cli_args
(
args
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
,
usage_context
=
UsageContext
.
OPENAI_API_SERVER
)
engine_args
,
usage_context
=
UsageContext
.
OPENAI_API_SERVER
)
openai_serving_chat
=
OpenAIServingChat
(
engine
,
served_model_names
,
event_loop
:
Optional
[
asyncio
.
AbstractEventLoop
]
try
:
event_loop
=
asyncio
.
get_running_loop
()
except
RuntimeError
:
event_loop
=
None
if
event_loop
is
not
None
and
event_loop
.
is_running
():
# If the current is instanced by Ray Serve,
# there is already a running event loop
model_config
=
event_loop
.
run_until_complete
(
engine
.
get_model_config
())
else
:
# When using single vLLM without engine_use_ray
model_config
=
asyncio
.
run
(
engine
.
get_model_config
())
openai_serving_chat
=
OpenAIServingChat
(
engine
,
model_config
,
served_model_names
,
args
.
response_role
,
args
.
response_role
,
args
.
lora_modules
,
args
.
lora_modules
,
args
.
chat_template
)
args
.
chat_template
)
openai_serving_completion
=
OpenAIServingCompletion
(
openai_serving_completion
=
OpenAIServingCompletion
(
engine
,
served_model_names
,
args
.
lora_modules
)
engine
,
model_config
,
served_model_names
,
args
.
lora_modules
)
app
.
root_path
=
args
.
root_path
app
.
root_path
=
args
.
root_path
uvicorn
.
run
(
app
,
uvicorn
.
run
(
app
,
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
f12b20de
import
asyncio
import
codecs
import
codecs
import
time
import
time
from
typing
import
(
AsyncGenerator
,
AsyncIterator
,
Awaitable
,
Iterable
,
List
,
from
typing
import
(
AsyncGenerator
,
AsyncIterator
,
Awaitable
,
Iterable
,
List
,
...
@@ -8,6 +7,7 @@ from fastapi import Request
...
@@ -8,6 +7,7 @@ from fastapi import Request
from
openai.types.chat
import
(
ChatCompletionContentPartParam
,
from
openai.types.chat
import
(
ChatCompletionContentPartParam
,
ChatCompletionRole
)
ChatCompletionRole
)
from
vllm.config
import
ModelConfig
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
ChatCompletionResponse
,
ChatCompletionRequest
,
ChatCompletionResponse
,
...
@@ -35,17 +35,47 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -35,17 +35,47 @@ class OpenAIServingChat(OpenAIServing):
def
__init__
(
self
,
def
__init__
(
self
,
engine
:
AsyncLLMEngine
,
engine
:
AsyncLLMEngine
,
model_config
:
ModelConfig
,
served_model_names
:
List
[
str
],
served_model_names
:
List
[
str
],
response_role
:
str
,
response_role
:
str
,
lora_modules
:
Optional
[
List
[
LoRAModulePath
]]
=
None
,
lora_modules
:
Optional
[
List
[
LoRAModulePath
]]
=
None
,
chat_template
:
Optional
[
str
]
=
None
):
chat_template
:
Optional
[
str
]
=
None
):
super
().
__init__
(
engine
=
engine
,
super
().
__init__
(
engine
=
engine
,
model_config
=
model_config
,
served_model_names
=
served_model_names
,
served_model_names
=
served_model_names
,
lora_modules
=
lora_modules
,
lora_modules
=
lora_modules
)
await_post_init
=
self
.
_load_chat_template
(
chat_template
=
chat_template
))
self
.
response_role
=
response_role
self
.
response_role
=
response_role
self
.
_load_chat_template
(
chat_template
)
def
_load_chat_template
(
self
,
chat_template
:
Optional
[
str
]):
tokenizer
=
self
.
tokenizer
if
chat_template
is
not
None
:
try
:
with
open
(
chat_template
,
"r"
)
as
f
:
tokenizer
.
chat_template
=
f
.
read
()
except
OSError
as
e
:
JINJA_CHARS
=
"{}
\n
"
if
not
any
(
c
in
chat_template
for
c
in
JINJA_CHARS
):
msg
=
(
f
"The supplied chat template (
{
chat_template
}
) "
f
"looks like a file path, but it failed to be "
f
"opened. Reason:
{
e
}
"
)
raise
ValueError
(
msg
)
from
e
# If opening a file fails, set chat template to be args to
# ensure we decode so our escape are interpreted correctly
tokenizer
.
chat_template
=
codecs
.
decode
(
chat_template
,
"unicode_escape"
)
logger
.
info
(
"Using supplied chat template:
\n
%s"
,
tokenizer
.
chat_template
)
elif
tokenizer
.
chat_template
is
not
None
:
logger
.
info
(
"Using default chat template:
\n
%s"
,
tokenizer
.
chat_template
)
else
:
logger
.
warning
(
"No chat template provided. Chat API will not work."
)
def
_parse_chat_message_content
(
def
_parse_chat_message_content
(
self
,
self
,
...
@@ -358,35 +388,3 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -358,35 +388,3 @@ class OpenAIServingChat(OpenAIServing):
)
)
return
response
return
response
\ No newline at end of file
async
def
_load_chat_template
(
self
,
chat_template
:
Optional
[
str
]):
while
self
.
tokenizer
is
None
:
# Give the parent class time to load the tokenizer
await
asyncio
.
sleep
(
0.1
)
tokenizer
=
self
.
tokenizer
if
chat_template
is
not
None
:
try
:
with
open
(
chat_template
,
"r"
)
as
f
:
tokenizer
.
chat_template
=
f
.
read
()
except
OSError
as
e
:
JINJA_CHARS
=
"{}
\n
"
if
not
any
(
c
in
chat_template
for
c
in
JINJA_CHARS
):
msg
=
(
f
"The supplied chat template (
{
chat_template
}
) "
f
"looks like a file path, but it failed to be "
f
"opened. Reason:
{
e
}
"
)
raise
ValueError
(
msg
)
from
e
# If opening a file fails, set chat template to be args to
# ensure we decode so our escape are interpreted correctly
tokenizer
.
chat_template
=
codecs
.
decode
(
chat_template
,
"unicode_escape"
)
logger
.
info
(
"Using supplied chat template:
\n
%s"
,
tokenizer
.
chat_template
)
elif
tokenizer
.
chat_template
is
not
None
:
logger
.
info
(
"Using default chat template:
\n
%s"
,
tokenizer
.
chat_template
)
else
:
logger
.
warning
(
"No chat template provided. Chat API will not work."
)
vllm/entrypoints/openai/serving_completion.py
View file @
f12b20de
...
@@ -4,6 +4,7 @@ from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List,
...
@@ -4,6 +4,7 @@ from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List,
from
fastapi
import
Request
from
fastapi
import
Request
from
vllm.config
import
ModelConfig
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
CompletionRequest
,
from
vllm.entrypoints.openai.protocol
import
(
CompletionRequest
,
CompletionResponse
,
CompletionResponse
,
...
@@ -52,11 +53,11 @@ def parse_prompt_format(prompt) -> Tuple[bool, list]:
...
@@ -52,11 +53,11 @@ def parse_prompt_format(prompt) -> Tuple[bool, list]:
class
OpenAIServingCompletion
(
OpenAIServing
):
class
OpenAIServingCompletion
(
OpenAIServing
):
def
__init__
(
self
,
def
__init__
(
self
,
engine
:
AsyncLLMEngine
,
model_config
:
ModelConfig
,
engine
:
AsyncLLMEngine
,
served_model_names
:
List
[
str
],
served_model_names
:
List
[
str
],
lora_modules
:
Optional
[
List
[
LoRAModulePath
]]
=
None
):
lora_modules
:
Optional
[
List
[
LoRAModulePath
]]):
super
().
__init__
(
engine
=
engine
,
super
().
__init__
(
engine
=
engine
,
model_config
=
model_config
,
served_model_names
=
served_model_names
,
served_model_names
=
served_model_names
,
lora_modules
=
lora_modules
)
lora_modules
=
lora_modules
)
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
f12b20de
import
asyncio
import
json
import
json
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
Any
,
Awaitable
,
Dict
,
List
,
Optional
,
Tuple
,
Union
from
typing
import
Dict
,
List
,
Optional
,
Tuple
,
Union
from
pydantic
import
Field
from
pydantic
import
Field
from
transformers
import
PreTrainedTokenizer
,
PreTrainedTokenizerFast
from
typing_extensions
import
Annotated
from
typing_extensions
import
Annotated
from
vllm.config
import
ModelConfig
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
CompletionRequest
,
ErrorResponse
,
CompletionRequest
,
ErrorResponse
,
...
@@ -29,13 +28,24 @@ class LoRAModulePath:
...
@@ -29,13 +28,24 @@ class LoRAModulePath:
class
OpenAIServing
:
class
OpenAIServing
:
def
__init__
(
self
,
def
__init__
(
self
,
engine
:
AsyncLLMEngine
,
model_config
:
ModelConfig
,
engine
:
AsyncLLMEngine
,
served_model_names
:
List
[
str
],
served_model_names
:
List
[
str
],
lora_modules
:
Optional
[
List
[
LoRAModulePath
]],
lora_modules
:
Optional
[
List
[
LoRAModulePath
]]):
await_post_init
:
Optional
[
Awaitable
[
Any
]]
=
None
):
super
().
__init__
()
self
.
engine
=
engine
self
.
engine
=
engine
self
.
max_model_len
=
model_config
.
max_model_len
# A separate tokenizer to map token IDs to strings.
self
.
tokenizer
=
get_tokenizer
(
model_config
.
tokenizer
,
tokenizer_mode
=
model_config
.
tokenizer_mode
,
tokenizer_revision
=
model_config
.
tokenizer_revision
,
trust_remote_code
=
model_config
.
trust_remote_code
,
truncation_side
=
"left"
)
self
.
served_model_names
=
served_model_names
self
.
served_model_names
=
served_model_names
if
lora_modules
is
None
:
if
lora_modules
is
None
:
self
.
lora_requests
=
[]
self
.
lora_requests
=
[]
else
:
else
:
...
@@ -47,38 +57,6 @@ class OpenAIServing:
...
@@ -47,38 +57,6 @@ class OpenAIServing:
)
for
i
,
lora
in
enumerate
(
lora_modules
,
start
=
1
)
)
for
i
,
lora
in
enumerate
(
lora_modules
,
start
=
1
)
]
]
self
.
max_model_len
=
0
# Lazy initialized
self
.
tokenizer
:
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
]
try
:
event_loop
=
asyncio
.
get_running_loop
()
except
RuntimeError
:
event_loop
=
None
if
event_loop
is
not
None
and
event_loop
.
is_running
():
# If the current is instanced by Ray Serve,
# there is already a running event loop
event_loop
.
create_task
(
self
.
_post_init
(
await_post_init
))
else
:
# When using single vLLM without engine_use_ray
asyncio
.
run
(
self
.
_post_init
(
await_post_init
))
async
def
_post_init
(
self
,
await_post_init
):
engine_model_config
=
await
self
.
engine
.
get_model_config
()
self
.
max_model_len
=
engine_model_config
.
max_model_len
# A separate tokenizer to map token IDs to strings.
self
.
tokenizer
=
get_tokenizer
(
engine_model_config
.
tokenizer
,
tokenizer_mode
=
engine_model_config
.
tokenizer_mode
,
tokenizer_revision
=
engine_model_config
.
tokenizer_revision
,
trust_remote_code
=
engine_model_config
.
trust_remote_code
,
truncation_side
=
"left"
)
if
await_post_init
is
not
None
:
await
await_post_init
async
def
show_available_models
(
self
)
->
ModelList
:
async
def
show_available_models
(
self
)
->
ModelList
:
"""Show available models. Right now we only have one model."""
"""Show available models. Right now we only have one model."""
model_cards
=
[
model_cards
=
[
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment