xdb4_94051 / vllm

Commit 665cbcec (unverified)
Authored Nov 27, 2023 by Yunmo Chen; committed by GitHub, Nov 26, 2023

Added echo function to OpenAI API server. (#1504)
Parent: 7c600440

Showing 2 changed files with 71 additions and 24 deletions (+71 -24)

  vllm/entrypoints/openai/api_server.py   +70 -22
  vllm/entrypoints/openai/protocol.py      +1  -2
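The commit makes the OpenAI-compatible /v1/completions endpoint honor the `echo` parameter instead of rejecting it. As a rough usage sketch (the server URL, port, and model name below are placeholders, not part of this commit), a request might look like:

# Sketch only: assumes a vLLM OpenAI API server is already running locally,
# e.g. started with `python -m vllm.entrypoints.openai.api_server --model facebook/opt-125m`.
import json

import requests

payload = {
    "model": "facebook/opt-125m",      # placeholder model name
    "prompt": "The capital of France is",
    "max_tokens": 5,
    "logprobs": 1,
    "echo": True,                      # now supported: the prompt is prepended to the returned text
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload)
choice = resp.json()["choices"][0]
print(choice["text"])                  # echoed prompt followed by the generated continuation
print(json.dumps(choice["logprobs"], indent=2))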
vllm/entrypoints/openai/api_server.py
@@ -160,16 +160,26 @@ async def show_available_models():
     return ModelList(data=model_cards)
 
 
-def create_logprobs(token_ids: List[int],
-                    id_logprobs: List[Dict[int, float]],
-                    initial_text_offset: int = 0) -> LogProbs:
+def create_logprobs(
+    token_ids: List[int],
+    top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None,
+    num_output_top_logprobs: Optional[int] = None,
+    initial_text_offset: int = 0,
+) -> LogProbs:
     """Create OpenAI-style logprobs."""
     logprobs = LogProbs()
     last_token_len = 0
-    for token_id, id_logprob in zip(token_ids, id_logprobs):
+    if num_output_top_logprobs:
+        logprobs.top_logprobs = []
+    for i, token_id in enumerate(token_ids):
+        step_top_logprobs = top_logprobs[i]
+        if step_top_logprobs is not None:
+            token_logprob = step_top_logprobs[token_id]
+        else:
+            token_logprob = None
         token = tokenizer.convert_ids_to_tokens(token_id)
         logprobs.tokens.append(token)
-        logprobs.token_logprobs.append(id_logprob[token_id])
+        logprobs.token_logprobs.append(token_logprob)
         if len(logprobs.text_offset) == 0:
             logprobs.text_offset.append(initial_text_offset)
         else:
@@ -177,10 +187,11 @@ def create_logprobs(token_ids: List[int],
                                         last_token_len)
         last_token_len = len(token)
 
-        logprobs.top_logprobs.append({
-            tokenizer.convert_ids_to_tokens(i): p
-            for i, p in id_logprob.items()
-        })
+        if num_output_top_logprobs:
+            logprobs.top_logprobs.append({
+                tokenizer.convert_ids_to_tokens(i): p
+                for i, p in step_top_logprobs.items()
+            } if step_top_logprobs else None)
     return logprobs
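To make the parallel-list layout that create_logprobs accumulates concrete, here is a small self-contained sketch; the toy vocabulary and log-probabilities are invented for illustration and stand in for the server's global tokenizer:

from typing import Dict, List, Optional

# Toy vocabulary standing in for tokenizer.convert_ids_to_tokens().
TOY_VOCAB = {7: "Hello", 11: ",", 42: " world"}

def toy_logprobs(token_ids: List[int],
                 top_logprobs: List[Optional[Dict[int, float]]]):
    """Accumulate OpenAI-style parallel lists, mirroring create_logprobs."""
    tokens, token_logprobs, text_offset = [], [], []
    offset = 0
    for i, token_id in enumerate(token_ids):
        step = top_logprobs[i]
        token = TOY_VOCAB[token_id]
        tokens.append(token)
        token_logprobs.append(step[token_id] if step is not None else None)
        text_offset.append(offset)
        offset += len(token)
    return {"tokens": tokens, "token_logprobs": token_logprobs,
            "text_offset": text_offset}

print(toy_logprobs([7, 11, 42],
                   [None, {11: -0.3, 7: -2.1}, {42: -0.1}]))
# {'tokens': ['Hello', ',', ' world'], 'token_logprobs': [None, -0.3, -0.1],
#  'text_offset': [0, 5, 6]}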
@@ -371,8 +382,6 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     for the API specification. This API mimics the OpenAI Completion API.
 
     NOTE: Currently we do not support the following features:
-        - echo (since the vLLM engine does not currently support
-          getting the logprobs of prompt tokens)
         - suffix (the language models we currently support do not support
           suffix)
         - logit_bias (to be supported by vLLM engine)
@@ -383,11 +392,8 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     if error_check_ret is not None:
         return error_check_ret
 
-    if request.echo:
-        # We do not support echo since the vLLM engine does not
-        # currently support getting the logprobs of prompt tokens.
-        return create_error_response(HTTPStatus.BAD_REQUEST,
-                                     "echo is not currently supported")
+    # OpenAI API supports echoing the prompt when max_tokens is 0.
+    echo_without_generation = request.echo and request.max_tokens == 0
 
     if request.suffix is not None:
         # The language models we currently support do not support suffix.
@@ -443,9 +449,11 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
             stop=request.stop,
             stop_token_ids=request.stop_token_ids,
             ignore_eos=request.ignore_eos,
-            max_tokens=request.max_tokens,
+            max_tokens=request.max_tokens
+            if not echo_without_generation else 1,
             logprobs=request.logprobs,
             use_beam_search=request.use_beam_search,
+            prompt_logprobs=request.logprobs if request.echo else None,
             skip_special_tokens=request.skip_special_tokens,
             spaces_between_special_tokens=spaces_between_special_tokens,
         )
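The new prompt_logprobs argument to SamplingParams is what lets the server recover log-probabilities for the echoed prompt tokens. A hedged sketch of the same parameter used through vLLM's offline LLM API (placeholder model name; running it requires a GPU and a model download):

# Sketch only: illustrates prompt_logprobs outside the API server.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")   # placeholder model
params = SamplingParams(max_tokens=8, logprobs=1, prompt_logprobs=1)
output = llm.generate(["The capital of France is"], params)[0]
print(output.prompt_logprobs)          # one dict per prompt token; the first entry is None
print(output.outputs[0].logprobs)      # one dict per generated token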
@@ -495,24 +503,42 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     async def completion_stream_generator() -> AsyncGenerator[str, None]:
         previous_texts = [""] * request.n
         previous_num_tokens = [0] * request.n
+        has_echoed = [False] * request.n
         async for res in result_generator:
             res: RequestOutput
             for output in res.outputs:
                 i = output.index
                 delta_text = output.text[len(previous_texts[i]):]
+                token_ids = output.token_ids[previous_num_tokens[i]:]
+                top_logprobs = output.logprobs[previous_num_tokens[i]:]
+                offsets = len(previous_texts[i])
+                if request.echo and not has_echoed[i]:
+                    if not echo_without_generation:
+                        delta_text = res.prompt + delta_text
+                        token_ids = res.prompt_token_ids + token_ids
+                        top_logprobs = res.prompt_logprobs + top_logprobs
+                    else:
+                        delta_text = res.prompt
+                        token_ids = res.prompt_token_ids
+                        top_logprobs = res.prompt_logprobs
+                    has_echoed[i] = True
                 if request.logprobs is not None:
                     logprobs = create_logprobs(
-                        output.token_ids[previous_num_tokens[i]:],
-                        output.logprobs[previous_num_tokens[i]:],
-                        len(previous_texts[i]))
+                        token_ids=token_ids,
+                        top_logprobs=top_logprobs,
+                        num_output_top_logprobs=request.logprobs,
+                        initial_text_offset=offsets,
+                    )
                 else:
                     logprobs = None
                 previous_texts[i] = output.text
                 previous_num_tokens[i] = len(output.token_ids)
                 finish_reason = output.finish_reason
                 response_json = create_stream_response_json(
                     index=i,
                     text=delta_text,
                     logprobs=logprobs,
                     finish_reason=finish_reason,
                 )
                 yield f"data: {response_json}\n\n"
                 if output.finish_reason is not None:
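Each chunk produced by the generator above goes out as a server-sent-events line of the form `data: {...}`. A minimal client-side sketch for consuming the echoed stream (URL and model name are placeholders, and the `[DONE]` sentinel is assumed from OpenAI-style streaming rather than shown in this diff):

# Sketch only: assumes a running server at http://localhost:8000.
import json

import requests

payload = {
    "model": "facebook/opt-125m",
    "prompt": "Once upon a time",
    "max_tokens": 16,
    "echo": True,
    "stream": True,
}
with requests.post("http://localhost:8000/v1/completions",
                   json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        chunk = line[len(b"data: "):]
        if chunk == b"[DONE]":
            break
        delta = json.loads(chunk)["choices"][0]["text"]
        # The first delta carries the echoed prompt, later ones the new tokens.
        print(delta, end="", flush=True)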
@@ -551,14 +577,36 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
         final_res = res
     assert final_res is not None
     choices = []
+    prompt_token_ids = final_res.prompt_token_ids
+    prompt_logprobs = final_res.prompt_logprobs
+    prompt_text = final_res.prompt
     for output in final_res.outputs:
         if request.logprobs is not None:
-            logprobs = create_logprobs(output.token_ids, output.logprobs)
+            if not echo_without_generation:
+                token_ids = output.token_ids
+                top_logprobs = output.logprobs
+                if request.echo:
+                    token_ids = prompt_token_ids + token_ids
+                    top_logprobs = prompt_logprobs + top_logprobs
+            else:
+                token_ids = prompt_token_ids
+                top_logprobs = prompt_logprobs
+            logprobs = create_logprobs(
+                token_ids=token_ids,
+                top_logprobs=top_logprobs,
+                num_output_top_logprobs=request.logprobs,
+            )
         else:
             logprobs = None
+        if not echo_without_generation:
+            output_text = output.text
+            if request.echo:
+                output_text = prompt_text + output_text
+        else:
+            output_text = prompt_text
         choice_data = CompletionResponseChoice(
             index=output.index,
-            text=output.text,
+            text=output_text,
             logprobs=logprobs,
             finish_reason=output.finish_reason,
         )
vllm/entrypoints/openai/protocol.py
@@ -106,8 +106,7 @@ class LogProbs(BaseModel):
     text_offset: List[int] = Field(default_factory=list)
     token_logprobs: List[Optional[float]] = Field(default_factory=list)
     tokens: List[str] = Field(default_factory=list)
-    top_logprobs: List[Optional[Dict[str, float]]] = Field(
-        default_factory=list)
+    top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None
 
 
 class CompletionResponseChoice(BaseModel):
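The practical effect of the protocol.py change is that top_logprobs now defaults to None rather than an empty list, so a response that never computed top logprobs serializes the field as null. A stand-in sketch of the changed model (a local copy for illustration, assuming pydantic v1 as vLLM used at the time, not an import from vLLM):

# Stand-in copy of the changed model, for illustration only.
from typing import Dict, List, Optional

from pydantic import BaseModel, Field

class LogProbs(BaseModel):
    text_offset: List[int] = Field(default_factory=list)
    token_logprobs: List[Optional[float]] = Field(default_factory=list)
    tokens: List[str] = Field(default_factory=list)
    # Changed field: defaults to None instead of an empty list.
    top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None

print(LogProbs().json())
# {"text_offset": [], "token_logprobs": [], "tokens": [], "top_logprobs": null}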