Commit 14cc317b (unverified)
Authored Jan 17, 2024 by FlorianJoncour; committed by GitHub Jan 16, 2024

OpenAI Server refactoring (#2360)

Parent: e1957c6e
Showing 8 changed files with 954 additions and 643 deletions.
.buildkite/test-pipeline.yaml                    +3    -0
requirements-dev.txt                             +3    -0
tests/async_engine/test_chat_template.py         +17   -19
tests/entrypoints/test_openai_server.py          +193  -0
vllm/entrypoints/openai/api_server.py            +25   -624
vllm/entrypoints/openai/serving_chat.py          +288  -0
vllm/entrypoints/openai/serving_completion.py    +295  -0
vllm/entrypoints/openai/serving_engine.py        +130  -0
.buildkite/test-pipeline.yaml
@@ -19,6 +19,9 @@ steps:
- label: Engine Test
  command: pytest -v -s engine

- label: Entrypoints Test
  command: pytest -v -s entrypoints

- label: Kernels Test
  command: pytest -v -s kernels
  soft_fail: true
requirements-dev.txt
@@ -16,3 +16,6 @@ pytest-asyncio
httpx
einops  # required for MPT
flash_attn  # required for HuggingFace's llama implementation
openai
requests
ray
\ No newline at end of file
tests/async_engine/test_openai_server.py → tests/async_engine/test_chat_template.py
from argparse import Namespace
from dataclasses import dataclass
import os
import pathlib

import pytest
from fastapi.testclient import TestClient

from vllm.entrypoints.openai.api_server import *
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.protocol import ChatCompletionRequest

chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
    __file__))).parent.parent / "examples/template_chatml.jinja"

@@ -48,7 +48,6 @@ TEST_MESSAGES = [
        'content': 'What is the capital of'
    },
]

client = TestClient(app)


@dataclass

@@ -56,13 +55,17 @@ class MockTokenizer:
    chat_template = None


@dataclass
class MockServingChat:
    tokenizer: MockTokenizer


def test_load_chat_template():
    # Testing chatml template
    mock_args = Namespace(chat_template=chatml_jinja_path)
    tokenizer = MockTokenizer()

    # Call the function with the mocked args
    load_chat_template(mock_args, tokenizer)
    mock_serving_chat = MockServingChat(tokenizer)
    OpenAIServingChat._load_chat_template(mock_serving_chat,
                                          chat_template=chatml_jinja_path)

    template_content = tokenizer.chat_template

@@ -76,11 +79,11 @@ def test_load_chat_template():
def test_no_load_chat_template():
    # Testing chatml template
    template = "../../examples/does_not_exist"
    mock_args = Namespace(chat_template=template)
    tokenizer = MockTokenizer()

    # Call the function with the mocked args
    load_chat_template(mock_args, tokenizer=tokenizer)
    mock_serving_chat = MockServingChat(tokenizer)
    OpenAIServingChat._load_chat_template(mock_serving_chat,
                                          chat_template=template)
    template_content = tokenizer.chat_template

    # Test assertions

@@ -97,9 +100,9 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
                              expected_output):
    # Initialize the tokenizer
    tokenizer = get_tokenizer(tokenizer_name=model)

    mock_args = Namespace(chat_template=template)
    load_chat_template(mock_args, tokenizer)
    mock_serving_chat = MockServingChat(tokenizer)
    OpenAIServingChat._load_chat_template(mock_serving_chat,
                                          chat_template=template)

    # Create a mock request object using keyword arguments
    mock_request = ChatCompletionRequest(

@@ -115,8 +118,3 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
    # Test assertion
    assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}"


def test_health_endpoint():
    response = client.get("/health")
    assert response.status_code == 200
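The renamed test covers loading a chat template from a file; `_load_chat_template` (defined in vllm/entrypoints/openai/serving_chat.py further below) also accepts an inline template string and decodes escape sequences when the argument is not a readable file. The following standalone sketch of that path reuses the same mock classes as the test; it is illustrative and not part of this diff.

# Sketch, not part of the diff: the inline-string path of
# OpenAIServingChat._load_chat_template, reusing the test's mock classes.
from dataclasses import dataclass

from vllm.entrypoints.openai.serving_chat import OpenAIServingChat


@dataclass
class MockTokenizer:
    chat_template = None


@dataclass
class MockServingChat:
    tokenizer: MockTokenizer


tokenizer = MockTokenizer()
# The argument is not a valid file path, so the OSError branch decodes it with
# codecs.decode(..., "unicode_escape"), turning the literal "\n" into a newline.
OpenAIServingChat._load_chat_template(
    MockServingChat(tokenizer),
    chat_template="{{ messages[0]['content'] }}\\n")
print(tokenizer.chat_template)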
tests/entrypoints/test_openai_server.py (new file, 0 → 100644)
import time
import subprocess

import sys
import pytest
import requests
import ray  # using Ray for overall ease of process management, parallel requests, and debugging.
import openai  # use the official client for correctness check

MAX_SERVER_START_WAIT_S = 600  # wait up to 600 seconds for the server to start
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"  # any model with a chat template should work here

pytestmark = pytest.mark.asyncio


@ray.remote(num_gpus=1)
class ServerRunner:

    def __init__(self, args):
        self.proc = subprocess.Popen(
            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
        self._wait_for_server()

    def ready(self):
        return True

    def _wait_for_server(self):
        # run health check
        start = time.time()
        while True:
            try:
                if requests.get(
                        "http://localhost:8000/health").status_code == 200:
                    break
            except Exception as err:
                if self.proc.poll() is not None:
                    raise RuntimeError("Server exited unexpectedly.") from err

                time.sleep(0.5)
                if time.time() - start > MAX_SERVER_START_WAIT_S:
                    raise RuntimeError(
                        "Server failed to start in time.") from err

    def __del__(self):
        if hasattr(self, "proc"):
            self.proc.terminate()


@pytest.fixture(scope="session")
def server():
    ray.init()
    server_runner = ServerRunner.remote([
        "--model",
        MODEL_NAME,
        "--dtype",
        "bfloat16",  # use half precision for speed and memory savings in CI environment
        "--max-model-len",
        "8192"
    ])
    ray.get(server_runner.ready.remote())
    yield server_runner
    ray.shutdown()


@pytest.fixture(scope="session")
def client():
    client = openai.AsyncOpenAI(
        base_url="http://localhost:8000/v1",
        api_key="token-abc123",
    )
    yield client


async def test_single_completion(server, client: openai.AsyncOpenAI):
    completion = await client.completions.create(model=MODEL_NAME,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)

    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 1
    assert completion.choices[0].text is not None and len(
        completion.choices[0].text) >= 5
    assert completion.choices[0].finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11)


async def test_single_chat_session(server, client: openai.AsyncOpenAI):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "what is 1+1?"
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=10,
    )
    assert chat_completion.id is not None
    assert chat_completion.choices is not None and len(
        chat_completion.choices) == 1
    assert chat_completion.choices[0].message is not None
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0


async def test_completion_streaming(server, client: openai.AsyncOpenAI):
    prompt = "What is an LLM?"

    single_completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
    )
    single_output = single_completion.choices[0].text
    single_usage = single_completion.usage

    stream = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )
    chunks = []
    async for chunk in stream:
        chunks.append(chunk.choices[0].text)
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.usage == single_usage
    assert "".join(chunks) == single_output


async def test_chat_streaming(server, client: openai.AsyncOpenAI):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "what is 1+1?"
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks = []
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
    assert chunk.choices[0].finish_reason == stop_reason
    assert "".join(chunks) == output


if __name__ == "__main__":
    pytest.main([__file__])
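For reference, the new test drives the server exactly the way an external client would. The following standalone sketch mirrors test_single_completion outside the Ray/pytest harness; it assumes a server has already been started separately with the same model and the default port, and is not part of this diff.

# Standalone sketch, not part of the diff. Assumes a running server, e.g.:
#   python3 -m vllm.entrypoints.openai.api_server --model HuggingFaceH4/zephyr-7b-beta
import asyncio

import openai


async def main():
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="token-abc123")
    completion = await client.completions.create(
        model="HuggingFaceH4/zephyr-7b-beta",
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0)
    print(completion.choices[0].text)


if __name__ == "__main__":
    asyncio.run(main())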
vllm/entrypoints/openai/api_server.py
(This diff is collapsed in the original view and is not shown here; +25, -624.)
vllm/entrypoints/openai/serving_chat.py (new file, 0 → 100644)
import time
import codecs
from fastapi import Request
from typing import AsyncGenerator, AsyncIterator, Union
from vllm.logger import init_logger
from vllm.utils import random_uuid
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest, ChatCompletionResponse,
    ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
    ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
    UsageInfo)
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.entrypoints.openai.serving_engine import OpenAIServing

logger = init_logger(__name__)


class OpenAIServingChat(OpenAIServing):

    def __init__(self,
                 engine: AsyncLLMEngine,
                 served_model: str,
                 response_role: str,
                 chat_template=None):
        super().__init__(engine=engine, served_model=served_model)
        self.response_role = response_role
        self._load_chat_template(chat_template)

    async def create_chat_completion(
        self, request: ChatCompletionRequest, raw_request: Request
    ) -> Union[ErrorResponse, AsyncGenerator[str, None],
               ChatCompletionResponse]:
        """Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/chat/create
        for the API specification. This API mimics the OpenAI ChatCompletion API.

        NOTE: Currently we do not support the following features:
            - function_call (Users should implement this by themselves)
            - logit_bias (to be supported by vLLM engine)
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        if request.logit_bias is not None and len(request.logit_bias) > 0:
            # TODO: support logit_bias in vLLM engine.
            return self.create_error_response(
                "logit_bias is not currently supported")

        try:
            prompt = self.tokenizer.apply_chat_template(
                conversation=request.messages,
                tokenize=False,
                add_generation_prompt=request.add_generation_prompt)
        except Exception as e:
            logger.error(
                f"Error in applying chat template from request: {str(e)}")
            return self.create_error_response(str(e))

        token_ids, error_check_ret = await self._check_length(request,
                                                              prompt=prompt)
        if error_check_ret is not None:
            return error_check_ret

        request_id = f"cmpl-{random_uuid()}"
        try:
            spaces_between_special_tokens = request.spaces_between_special_tokens
            sampling_params = SamplingParams(
                n=request.n,
                presence_penalty=request.presence_penalty,
                frequency_penalty=request.frequency_penalty,
                repetition_penalty=request.repetition_penalty,
                temperature=request.temperature,
                top_p=request.top_p,
                min_p=request.min_p,
                stop=request.stop,
                stop_token_ids=request.stop_token_ids,
                max_tokens=request.max_tokens,
                best_of=request.best_of,
                top_k=request.top_k,
                ignore_eos=request.ignore_eos,
                use_beam_search=request.use_beam_search,
                skip_special_tokens=request.skip_special_tokens,
                spaces_between_special_tokens=spaces_between_special_tokens,
            )
        except ValueError as e:
            return self.create_error_response(str(e))

        result_generator = self.engine.generate(prompt, sampling_params,
                                                request_id, token_ids)
        # Streaming response
        if request.stream:
            return self.chat_completion_stream_generator(
                request, result_generator, request_id)
        else:
            return await self.chat_completion_full_generator(
                request, raw_request, result_generator, request_id)

    def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
        if request.add_generation_prompt:
            return self.response_role
        else:
            return request.messages[-1].role

    async def chat_completion_stream_generator(
            self, request: ChatCompletionRequest,
            result_generator: AsyncIterator[RequestOutput], request_id: str
    ) -> Union[ErrorResponse, AsyncGenerator[str, None]]:

        model_name = request.model
        created_time = int(time.monotonic())
        chunk_object_type = "chat.completion.chunk"

        # Send first response for each request.n (index) with the role
        role = self.get_chat_request_role(request)
        for i in range(request.n):
            choice_data = ChatCompletionResponseStreamChoice(
                index=i, delta=DeltaMessage(role=role), finish_reason=None)
            chunk = ChatCompletionStreamResponse(id=request_id,
                                                 object=chunk_object_type,
                                                 created=created_time,
                                                 choices=[choice_data],
                                                 model=model_name)
            data = chunk.json(exclude_unset=True, ensure_ascii=False)
            yield f"data: {data}\n\n"

        # Send response to echo the input portion of the last message
        if request.echo:
            last_msg_content = ""
            if request.messages and isinstance(
                    request.messages, list) and request.messages[-1].get(
                        "content") and request.messages[-1].get(
                            "role") == role:
                last_msg_content = request.messages[-1]["content"]
            if last_msg_content:
                for i in range(request.n):
                    choice_data = ChatCompletionResponseStreamChoice(
                        index=i,
                        delta=DeltaMessage(content=last_msg_content),
                        finish_reason=None)
                    chunk = ChatCompletionStreamResponse(
                        id=request_id,
                        object=chunk_object_type,
                        created=created_time,
                        choices=[choice_data],
                        model=model_name)
                    data = chunk.json(exclude_unset=True, ensure_ascii=False)
                    yield f"data: {data}\n\n"

        # Send response for each token for each request.n (index)
        previous_texts = [""] * request.n
        previous_num_tokens = [0] * request.n
        finish_reason_sent = [False] * request.n
        async for res in result_generator:
            res: RequestOutput
            for output in res.outputs:
                i = output.index

                if finish_reason_sent[i]:
                    continue

                delta_text = output.text[len(previous_texts[i]):]
                previous_texts[i] = output.text
                previous_num_tokens[i] = len(output.token_ids)

                if output.finish_reason is None:
                    # Send token-by-token response for each request.n
                    choice_data = ChatCompletionResponseStreamChoice(
                        index=i,
                        delta=DeltaMessage(content=delta_text),
                        finish_reason=None)
                    chunk = ChatCompletionStreamResponse(
                        id=request_id,
                        object=chunk_object_type,
                        created=created_time,
                        choices=[choice_data],
                        model=model_name)
                    data = chunk.json(exclude_unset=True, ensure_ascii=False)
                    yield f"data: {data}\n\n"
                else:
                    # Send the finish response for each request.n only once
                    prompt_tokens = len(res.prompt_token_ids)
                    final_usage = UsageInfo(
                        prompt_tokens=prompt_tokens,
                        completion_tokens=previous_num_tokens[i],
                        total_tokens=prompt_tokens + previous_num_tokens[i],
                    )
                    choice_data = ChatCompletionResponseStreamChoice(
                        index=i,
                        delta=DeltaMessage(content=delta_text),
                        finish_reason=output.finish_reason)
                    chunk = ChatCompletionStreamResponse(
                        id=request_id,
                        object=chunk_object_type,
                        created=created_time,
                        choices=[choice_data],
                        model=model_name)
                    if final_usage is not None:
                        chunk.usage = final_usage
                    data = chunk.json(exclude_unset=True,
                                      exclude_none=True,
                                      ensure_ascii=False)
                    yield f"data: {data}\n\n"
                    finish_reason_sent[i] = True
        # Send the final done message after all response.n are finished
        yield "data: [DONE]\n\n"

    async def chat_completion_full_generator(
            self, request: ChatCompletionRequest, raw_request: Request,
            result_generator: AsyncIterator[RequestOutput],
            request_id: str) -> Union[ErrorResponse, ChatCompletionResponse]:

        model_name = request.model
        created_time = int(time.monotonic())
        final_res: RequestOutput = None

        async for res in result_generator:
            if await raw_request.is_disconnected():
                # Abort the request if the client disconnects.
                await self.engine.abort(request_id)
                return self.create_error_response("Client disconnected")
            final_res = res
        assert final_res is not None

        choices = []
        role = self.get_chat_request_role(request)
        for output in final_res.outputs:
            choice_data = ChatCompletionResponseChoice(
                index=output.index,
                message=ChatMessage(role=role, content=output.text),
                finish_reason=output.finish_reason,
            )
            choices.append(choice_data)

        if request.echo:
            last_msg_content = ""
            if request.messages and isinstance(
                    request.messages, list) and request.messages[-1].get(
                        "content") and request.messages[-1].get(
                            "role") == role:
                last_msg_content = request.messages[-1]["content"]

            for choice in choices:
                full_message = last_msg_content + choice.message.content
                choice.message.content = full_message

        num_prompt_tokens = len(final_res.prompt_token_ids)
        num_generated_tokens = sum(
            len(output.token_ids) for output in final_res.outputs)
        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
        )
        response = ChatCompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
        )

        return response

    def _load_chat_template(self, chat_template):
        if chat_template is not None:
            try:
                with open(chat_template, "r") as f:
                    self.tokenizer.chat_template = f.read()
            except OSError:
                # If opening a file fails, treat chat_template as a literal
                # template string and decode it so escape sequences are
                # interpreted correctly.
                self.tokenizer.chat_template = codecs.decode(
                    chat_template, "unicode_escape")

            logger.info(
                f"Using supplied chat template:\n{self.tokenizer.chat_template}")
        elif self.tokenizer.chat_template is not None:
            logger.info(
                f"Using default chat template:\n{self.tokenizer.chat_template}")
        else:
            logger.warning(
                "No chat template provided. Chat API will not work.")
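Because the api_server.py diff is collapsed above, here is a hedged sketch of how a handler class like OpenAIServingChat is typically mounted on a FastAPI route. The engine setup, global names, and error handling below are assumptions for illustration and are not taken verbatim from this commit.

# Illustrative sketch only, not taken from the collapsed api_server.py diff.
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat

app = FastAPI()
openai_serving_chat: OpenAIServingChat = None  # built at startup from the engine and CLI args (assumed)


@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):
    generator = await openai_serving_chat.create_chat_completion(
        request, raw_request)
    if isinstance(generator, ErrorResponse):
        return JSONResponse(content=generator.dict(),
                            status_code=generator.code)
    if request.stream:
        # create_chat_completion returned an async generator of SSE strings.
        return StreamingResponse(content=generator,
                                 media_type="text/event-stream")
    return JSONResponse(content=generator.dict())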
vllm/entrypoints/openai/serving_completion.py (new file, 0 → 100644)
import time
from fastapi import Request
from typing import AsyncGenerator, Optional
from vllm.logger import init_logger
from vllm.utils import random_uuid
from vllm.engine.async_llm_engine import AsyncLLMEngine
from .protocol import (CompletionRequest, CompletionResponse,
                       CompletionResponseChoice,
                       CompletionResponseStreamChoice,
                       CompletionStreamResponse, LogProbs, UsageInfo)
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.entrypoints.openai.serving_engine import OpenAIServing

logger = init_logger(__name__)


class OpenAIServingCompletion(OpenAIServing):

    def __init__(self, engine: AsyncLLMEngine, served_model: str):
        super().__init__(engine=engine, served_model=served_model)

    async def create_completion(self, request: CompletionRequest,
                                raw_request: Request):
        """Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following features:
            - suffix (the language models we currently support do not support
              suffix)
            - logit_bias (to be supported by vLLM engine)
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # OpenAI API supports echoing the prompt when max_tokens is 0.
        echo_without_generation = request.echo and request.max_tokens == 0

        if request.suffix is not None:
            # The language models we currently support do not support suffix.
            return self.create_error_response(
                "suffix is not currently supported")

        if request.logit_bias is not None and len(request.logit_bias) > 0:
            # TODO: support logit_bias in vLLM engine.
            return self.create_error_response(
                "logit_bias is not currently supported")

        model_name = request.model
        request_id = f"cmpl-{random_uuid()}"

        use_token_ids = False
        if isinstance(request.prompt, list):
            if len(request.prompt) == 0:
                return self.create_error_response(
                    "please provide at least one prompt")
            first_element = request.prompt[0]
            if isinstance(first_element, int):
                use_token_ids = True
                prompt = request.prompt
            elif isinstance(first_element, (str, list)):
                # TODO: handles multiple prompt case in list[list[int]]
                if len(request.prompt) > 1:
                    return self.create_error_response(
                        "multiple prompts in a batch is not currently supported"
                    )
                use_token_ids = not isinstance(first_element, str)
                prompt = request.prompt[0]
        else:
            prompt = request.prompt

        if use_token_ids:
            _, error_check_ret = await self._check_length(request,
                                                          prompt_ids=prompt)
        else:
            token_ids, error_check_ret = await self._check_length(
                request, prompt=prompt)
        if error_check_ret is not None:
            return error_check_ret

        created_time = int(time.monotonic())
        try:
            spaces_between_special_tokens = request.spaces_between_special_tokens
            sampling_params = SamplingParams(
                n=request.n,
                best_of=request.best_of,
                presence_penalty=request.presence_penalty,
                frequency_penalty=request.frequency_penalty,
                repetition_penalty=request.repetition_penalty,
                temperature=request.temperature,
                top_p=request.top_p,
                top_k=request.top_k,
                min_p=request.min_p,
                stop=request.stop,
                stop_token_ids=request.stop_token_ids,
                ignore_eos=request.ignore_eos,
                max_tokens=request.max_tokens
                if not echo_without_generation else 1,
                logprobs=request.logprobs,
                use_beam_search=request.use_beam_search,
                prompt_logprobs=request.logprobs if request.echo else None,
                skip_special_tokens=request.skip_special_tokens,
                spaces_between_special_tokens=spaces_between_special_tokens,
            )
        except ValueError as e:
            return self.create_error_response(str(e))

        if use_token_ids:
            result_generator = self.engine.generate(None,
                                                    sampling_params,
                                                    request_id,
                                                    prompt_token_ids=prompt)
        else:
            result_generator = self.engine.generate(prompt, sampling_params,
                                                    request_id, token_ids)

        # Similar to the OpenAI API, when n != best_of, we do not stream the
        # results. In addition, we do not stream the results when using beam search.
        stream = (request.stream
                  and (request.best_of is None or request.n == request.best_of)
                  and not request.use_beam_search)

        def create_stream_response_json(
            index: int,
            text: str,
            logprobs: Optional[LogProbs] = None,
            finish_reason: Optional[str] = None,
            usage: Optional[UsageInfo] = None,
        ) -> str:
            choice_data = CompletionResponseStreamChoice(
                index=index,
                text=text,
                logprobs=logprobs,
                finish_reason=finish_reason,
            )
            response = CompletionStreamResponse(
                id=request_id,
                created=created_time,
                model=model_name,
                choices=[choice_data],
            )
            if usage is not None:
                response.usage = usage
            response_json = response.json(exclude_unset=True,
                                          ensure_ascii=False)

            return response_json

        async def completion_stream_generator() -> AsyncGenerator[str, None]:
            previous_texts = [""] * request.n
            previous_num_tokens = [0] * request.n
            has_echoed = [False] * request.n

            async for res in result_generator:
                res: RequestOutput
                for output in res.outputs:
                    i = output.index
                    delta_text = output.text[len(previous_texts[i]):]
                    token_ids = output.token_ids[previous_num_tokens[i]:]
                    if request.logprobs is not None:
                        top_logprobs = output.logprobs[previous_num_tokens[i]:]
                    else:
                        top_logprobs = None
                    offsets = len(previous_texts[i])
                    if request.echo and not has_echoed[i]:
                        if not echo_without_generation:
                            delta_text = res.prompt + delta_text
                            token_ids = res.prompt_token_ids + token_ids
                            if top_logprobs:
                                top_logprobs = res.prompt_logprobs + top_logprobs
                        else:
                            # only return the prompt
                            delta_text = res.prompt
                            token_ids = res.prompt_token_ids
                            if top_logprobs:
                                top_logprobs = res.prompt_logprobs
                        has_echoed[i] = True
                    if request.logprobs is not None:
                        logprobs = self._create_logprobs(
                            token_ids=token_ids,
                            top_logprobs=top_logprobs,
                            num_output_top_logprobs=request.logprobs,
                            initial_text_offset=offsets,
                        )
                    else:
                        logprobs = None
                    previous_texts[i] = output.text
                    previous_num_tokens[i] = len(output.token_ids)
                    finish_reason = output.finish_reason
                    response_json = create_stream_response_json(
                        index=i,
                        text=delta_text,
                        logprobs=logprobs,
                        finish_reason=finish_reason,
                    )
                    yield f"data: {response_json}\n\n"
                    if output.finish_reason is not None:
                        logprobs = (LogProbs()
                                    if request.logprobs is not None else None)
                        prompt_tokens = len(res.prompt_token_ids)
                        completion_tokens = len(output.token_ids)
                        final_usage = UsageInfo(
                            prompt_tokens=prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=prompt_tokens + completion_tokens,
                        )
                        response_json = create_stream_response_json(
                            index=i,
                            text="",
                            logprobs=logprobs,
                            finish_reason=output.finish_reason,
                            usage=final_usage,
                        )
                        yield f"data: {response_json}\n\n"

            yield "data: [DONE]\n\n"

        # Streaming response
        if stream:
            return completion_stream_generator()

        # Non-streaming response
        final_res: RequestOutput = None
        async for res in result_generator:
            if await raw_request.is_disconnected():
                # Abort the request if the client disconnects.
                await self.engine.abort(request_id)
                return self.create_error_response("Client disconnected")
            final_res = res
        assert final_res is not None
        choices = []
        prompt_token_ids = final_res.prompt_token_ids
        prompt_logprobs = final_res.prompt_logprobs
        prompt_text = final_res.prompt
        for output in final_res.outputs:
            if request.logprobs is not None:
                if not echo_without_generation:
                    token_ids = output.token_ids
                    top_logprobs = output.logprobs
                    if request.echo:
                        token_ids = prompt_token_ids + token_ids
                        top_logprobs = prompt_logprobs + top_logprobs
                else:
                    token_ids = prompt_token_ids
                    top_logprobs = prompt_logprobs
                logprobs = self._create_logprobs(
                    token_ids=token_ids,
                    top_logprobs=top_logprobs,
                    num_output_top_logprobs=request.logprobs,
                )
            else:
                logprobs = None
            if not echo_without_generation:
                output_text = output.text
                if request.echo:
                    output_text = prompt_text + output_text
            else:
                output_text = prompt_text
            choice_data = CompletionResponseChoice(
                index=output.index,
                text=output_text,
                logprobs=logprobs,
                finish_reason=output.finish_reason,
            )
            choices.append(choice_data)

        num_prompt_tokens = len(final_res.prompt_token_ids)
        num_generated_tokens = sum(
            len(output.token_ids) for output in final_res.outputs)
        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
        )

        response = CompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
        )

        if request.stream:
            # When the user requests streaming but we don't stream, we still need
            # to return a streaming response with a single event.
            response_json = response.json(ensure_ascii=False)

            async def fake_stream_generator() -> AsyncGenerator[str, None]:
                yield f"data: {response_json}\n\n"
                yield "data: [DONE]\n\n"

            return fake_stream_generator()

        return response
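Clients that do not use the OpenAI SDK can consume the "data: ..." server-sent events emitted by completion_stream_generator directly. The sketch below is illustrative, not part of the diff; the /v1/completions route, port, and model name are assumptions based on the usual OpenAI-compatible server setup, since the api_server.py diff is collapsed above.

# Sketch, not part of the diff: parse the SSE stream without the OpenAI SDK.
import json

import requests

resp = requests.post("http://localhost:8000/v1/completions",
                     json={
                         "model": "HuggingFaceH4/zephyr-7b-beta",
                         "prompt": "What is an LLM?",
                         "max_tokens": 5,
                         "stream": True,
                     },
                     stream=True)
for line in resp.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue
    payload = line[len("data: "):]
    if payload == "[DONE]":
        break
    chunk = json.loads(payload)
    print(chunk["choices"][0]["text"], end="", flush=True)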
vllm/entrypoints/openai/serving_engine.py (new file, 0 → 100644)
import asyncio
from http import HTTPStatus
from typing import Dict, List, Optional, Tuple, Union
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (CompletionRequest,
                                              ChatCompletionRequest,
                                              ErrorResponse, LogProbs,
                                              ModelCard, ModelList,
                                              ModelPermission)

logger = init_logger(__name__)


class OpenAIServing:

    def __init__(self, engine: AsyncLLMEngine, served_model: str):
        self.engine = engine
        self.served_model = served_model

        self.max_model_len = 0
        self.tokenizer = None

        try:
            event_loop = asyncio.get_running_loop()
        except RuntimeError:
            event_loop = None

        if event_loop is not None and event_loop.is_running():
            # If the current instance is created by Ray Serve,
            # there is already a running event loop.
            event_loop.create_task(self._post_init())
        else:
            # When using a single vLLM instance without engine_use_ray.
            asyncio.run(self._post_init())

    async def _post_init(self):
        engine_model_config = await self.engine.get_model_config()
        self.max_model_len = engine_model_config.max_model_len

        # A separate tokenizer to map token IDs to strings.
        self.tokenizer = get_tokenizer(
            engine_model_config.tokenizer,
            tokenizer_mode=engine_model_config.tokenizer_mode,
            trust_remote_code=engine_model_config.trust_remote_code)

    async def show_available_models(self) -> ModelList:
        """Show available models. Right now we only have one model."""
        model_cards = [
            ModelCard(id=self.served_model,
                      root=self.served_model,
                      permission=[ModelPermission()])
        ]
        return ModelList(data=model_cards)

    def _create_logprobs(
        self,
        token_ids: List[int],
        top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None,
        num_output_top_logprobs: Optional[int] = None,
        initial_text_offset: int = 0,
    ) -> LogProbs:
        """Create OpenAI-style logprobs."""
        logprobs = LogProbs()
        last_token_len = 0
        if num_output_top_logprobs:
            logprobs.top_logprobs = []
        for i, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[i]
            if step_top_logprobs is not None:
                token_logprob = step_top_logprobs[token_id]
            else:
                token_logprob = None
            token = self.tokenizer.convert_ids_to_tokens(token_id)
            logprobs.tokens.append(token)
            logprobs.token_logprobs.append(token_logprob)
            if len(logprobs.text_offset) == 0:
                logprobs.text_offset.append(initial_text_offset)
            else:
                logprobs.text_offset.append(logprobs.text_offset[-1] +
                                            last_token_len)
            last_token_len = len(token)

            if num_output_top_logprobs:
                logprobs.top_logprobs.append({
                    self.tokenizer.convert_ids_to_tokens(i): p
                    for i, p in step_top_logprobs.items()
                } if step_top_logprobs else None)
        return logprobs

    def create_error_response(
            self,
            message: str,
            err_type: str = "BadRequestError",
            status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
        return ErrorResponse(message=message,
                             type=err_type,
                             code=status_code.value)

    async def _check_model(self, request) -> Optional[ErrorResponse]:
        if request.model == self.served_model:
            return
        return self.create_error_response(
            message=f"The model `{request.model}` does not exist.",
            err_type="NotFoundError",
            status_code=HTTPStatus.NOT_FOUND)

    async def _check_length(
        self,
        request: Union[ChatCompletionRequest, CompletionRequest],
        prompt: Optional[str] = None,
        prompt_ids: Optional[List[int]] = None
    ) -> Tuple[List[int], Optional[ErrorResponse]]:
        assert (not (prompt is None and prompt_ids is None)
                and not (prompt is not None and prompt_ids is not None)
                ), "Either prompt or prompt_ids should be provided."
        input_ids = prompt_ids if prompt_ids is not None else self.tokenizer(
            prompt).input_ids
        token_num = len(input_ids)

        if request.max_tokens is None:
            request.max_tokens = self.max_model_len - token_num
        if token_num + request.max_tokens > self.max_model_len:
            return input_ids, self.create_error_response(
                f"This model's maximum context length is {self.max_model_len} tokens. "
                f"However, you requested {request.max_tokens + token_num} tokens "
                f"({token_num} in the messages, "
                f"{request.max_tokens} in the completion). "
                f"Please reduce the length of the messages or completion.")
        else:
            return input_ids, None
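The arithmetic in _check_length is easiest to see with concrete numbers. The sketch below reproduces the check outside the class; the 8192-token limit and 6-token prompt are illustrative values, not taken from the diff.

# Standalone sketch of the _check_length arithmetic with illustrative numbers.
max_model_len = 8192   # engine_model_config.max_model_len (example value)
token_num = 6          # length of the tokenized prompt (example value)
max_tokens = None      # as sent by the client

if max_tokens is None:
    # Default: fill the remaining context window.
    max_tokens = max_model_len - token_num   # 8186

# Accepted: 6 + 8186 == 8192 does not exceed the limit.
assert token_num + max_tokens <= max_model_len

# A request asking for 8200 completion tokens would fail the check
# (6 + 8200 > 8192) and receive the "maximum context length" error response.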