Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
774d1035
Unverified
Commit
774d1035
authored
Jun 10, 2024
by
Itay Etelis
Committed by
GitHub
Jun 10, 2024
Browse files
[Feature][Frontend]: Continued `stream_options` implementation also in CompletionRequest (#5319)
parent
6b29d6fe
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
180 additions
and
126 deletions
+180
-126
tests/entrypoints/test_openai_server.py
tests/entrypoints/test_openai_server.py
+132
-104
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+9
-0
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+17
-18
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+22
-4
No files found.
tests/entrypoints/test_openai_server.py
View file @
774d1035
...
...
@@ -478,8 +478,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
temperature
=
0.0
,
)
single_output
=
single_completion
.
choices
[
0
].
text
single_usage
=
single_completion
.
usage
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
...
...
@@ -495,7 +493,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
assert
finish_reason_count
==
1
assert
chunk
.
choices
[
0
].
finish_reason
==
"length"
assert
chunk
.
choices
[
0
].
text
assert
chunk
.
usage
==
single_usage
assert
""
.
join
(
chunks
)
==
single_output
...
...
@@ -550,6 +547,138 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
assert
""
.
join
(
chunks
)
==
output
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_chat_completion_stream_options(server,
                                              client: openai.AsyncOpenAI,
                                              model_name: str):
    """Exercise `stream_options` on the chat completions endpoint.

    Covers four request shapes:
      * stream=True,  include_usage=False -> no chunk carries usage.
      * stream=True,  include_usage=True  -> only one extra trailing chunk
        (with empty `choices`) carries usage.
      * stream=False, include_usage=None  -> request is rejected.
      * stream=False, include_usage=True  -> request is rejected.
    """
    messages = [{
        "role": "system",
        "content": "You are a helpful assistant."
    }, {
        "role": "user",
        "content": "What is the capital of France?"
    }]

    # Test stream=True, stream_options={"include_usage": False}
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
        stream=True,
        stream_options={"include_usage": False})
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options={"include_usage": True}
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
        stream=True,
        stream_options={"include_usage": True})
    # Drain the stream first so the checks below cannot be skipped: the
    # previous in-loop `__anext__()` approach only verified the usage chunk
    # if a finish chunk happened to be observed.
    received = [chunk async for chunk in stream]
    assert len(received) >= 2
    *choice_chunks, final_chunk = received
    for chunk in choice_chunks:
        # Every regular chunk carries a choice and no usage.
        assert chunk.choices
        assert chunk.usage is None
    # The chunk right before the usage chunk must be the finishing one.
    assert choice_chunks[-1].choices[0].finish_reason is not None
    assert final_chunk.usage is not None
    assert final_chunk.usage.prompt_tokens > 0
    assert final_chunk.usage.completion_tokens > 0
    assert final_chunk.usage.total_tokens == (
        final_chunk.usage.prompt_tokens +
        final_chunk.usage.completion_tokens)
    assert final_chunk.choices == []

    # Test stream=False, stream_options={"include_usage": None}
    with pytest.raises(BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_tokens=10,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": None})

    # Test stream=False, stream_options={"include_usage": True}
    with pytest.raises(BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_tokens=10,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": True})
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
                                         model_name: str):
    """Exercise `stream_options` on the (non-chat) completions endpoint.

    Covers four request shapes:
      * stream=True,  include_usage=False -> no chunk carries usage.
      * stream=True,  include_usage=True  -> only one extra trailing chunk
        (with empty `choices`) carries usage.
      * stream=False, include_usage=None  -> request is rejected.
      * stream=False, include_usage=True  -> request is rejected.
    """
    prompt = "What is the capital of France?"

    # Test stream=True, stream_options={"include_usage": False}
    stream = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={"include_usage": False})
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options={"include_usage": True}
    stream = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={"include_usage": True})
    # Drain the stream first so the checks below cannot be skipped: the
    # previous in-loop `__anext__()` approach only verified the usage chunk
    # if a finish chunk happened to be observed.
    received = [chunk async for chunk in stream]
    assert len(received) >= 2
    *choice_chunks, final_chunk = received
    for chunk in choice_chunks:
        # Every regular chunk carries a choice and no usage.
        assert chunk.choices
        assert chunk.usage is None
    # The chunk right before the usage chunk must be the finishing one.
    assert choice_chunks[-1].choices[0].finish_reason is not None
    assert final_chunk.usage is not None
    assert final_chunk.usage.prompt_tokens > 0
    assert final_chunk.usage.completion_tokens > 0
    assert final_chunk.usage.total_tokens == (
        final_chunk.usage.prompt_tokens +
        final_chunk.usage.completion_tokens)
    assert final_chunk.choices == []

    # Test stream=False, stream_options={"include_usage": None}
    with pytest.raises(BadRequestError):
        await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": None})

    # Test stream=False, stream_options={"include_usage": True}
    with pytest.raises(BadRequestError):
        await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": True})
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
...
...
@@ -1343,106 +1472,5 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
assert
embeddings
.
usage
.
total_tokens
==
17
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_stream_options(server, client: openai.AsyncOpenAI,
                              model_name: str):
    """Check `stream_options` handling on the completions endpoint.

    Streams with `stream_options` of None / include_usage=False / include_usage=True,
    then verifies that any `stream_options` combined with stream=False is
    rejected with `BadRequestError`.

    NOTE(review): unlike the neighbouring tests in this file, no
    `@pytest.mark.asyncio` marker is applied here -- confirm the configured
    pytest-asyncio mode actually runs this coroutine.
    """
    prompt = "What is the capital of France?"
    # Arguments shared by every request issued below.
    base_kwargs = dict(model=model_name,
                       prompt=prompt,
                       max_tokens=5,
                       temperature=0.0)

    # Test stream=True with stream_options=None, then
    # stream_options={"include_usage": False}
    for options in (None, {"include_usage": False}):
        stream = await client.completions.create(stream=True,
                                                 stream_options=options,
                                                 **base_kwargs)
        chunks = []
        async for chunk in stream:
            chunks.append(chunk.choices[0].text)
        assert chunks
        # NOTE(review): this is a membership test against the last chunk
        # object, not an attribute check -- confirm it tests what is intended.
        assert "usage" not in chunk

    # Test stream=True, stream_options={"include_usage": True}
    stream = await client.completions.create(
        stream=True,
        stream_options={"include_usage": True},
        **base_kwargs)
    chunks = []
    finish_reason_count = 0
    async for chunk in stream:
        finish_reason = chunk.choices[0].finish_reason
        assert chunk.usage is None
        if finish_reason is None:
            chunks.append(chunk.choices[0].text)
            continue
        finish_reason_count += 1

        # The last message should have usage and no choices
        last_message = await stream.__anext__()
        usage = last_message.usage
        assert usage is not None
        assert usage.prompt_tokens > 0
        assert usage.completion_tokens > 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)
        assert last_message.choices == []

    # Test stream=False with include_usage set to None, False and True:
    # each combination must be refused.
    for include_usage in (None, False, True):
        with pytest.raises(BadRequestError):
            await client.completions.create(
                stream=False,
                stream_options={"include_usage": include_usage},
                **base_kwargs)
if __name__ == "__main__":
    # Propagate pytest's exit status so that running this file directly
    # reports failures to the calling shell / CI instead of always exiting 0.
    raise SystemExit(pytest.main([__file__]))
vllm/entrypoints/openai/protocol.py
View file @
774d1035
...
...
@@ -346,6 +346,7 @@ class CompletionRequest(OpenAIBaseModel):
le
=
torch
.
iinfo
(
torch
.
long
).
max
)
stop
:
Optional
[
Union
[
str
,
List
[
str
]]]
=
Field
(
default_factory
=
list
)
stream
:
Optional
[
bool
]
=
False
stream_options
:
Optional
[
StreamOptions
]
=
None
suffix
:
Optional
[
str
]
=
None
temperature
:
Optional
[
float
]
=
1.0
top_p
:
Optional
[
float
]
=
1.0
...
...
@@ -482,6 +483,14 @@ class CompletionRequest(OpenAIBaseModel):
" in the interval [0, 5]."
))
return
data
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    """Reject requests that set `stream_options` without enabling `stream`.

    Runs as a "before" validator, i.e. on the raw input prior to field
    parsing.

    Args:
        data: The raw input handed to the model for validation.

    Returns:
        `data`, unchanged.

    Raises:
        ValueError: If `stream_options` is truthy while `stream` is falsy
            or missing.
    """
    # Raw input is not guaranteed to be a mapping in "before" mode; leave
    # anything else for the regular validation to reject instead of failing
    # here with AttributeError on `.get`.
    if not isinstance(data, dict):
        return data
    if data.get("stream_options") and not data.get("stream"):
        raise ValueError(
            "Stream options can only be defined when stream is True.")
    return data
class
EmbeddingRequest
(
BaseModel
):
# Ordered by official OpenAI API documentation
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
774d1035
...
...
@@ -441,25 +441,24 @@ class OpenAIServingChat(OpenAIServing):
yield
f
"data:
{
data
}
\n\n
"
finish_reason_sent
[
i
]
=
True
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
final_usage
=
UsageInfo
(
prompt_tokens
=
prompt_tokens
,
completion_tokens
=
previous_num_tokens
[
i
],
total_tokens
=
prompt_tokens
+
previous_num_tokens
[
i
],
)
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
final_usage
=
UsageInfo
(
prompt_tokens
=
prompt_tokens
,
completion_tokens
=
previous_num_tokens
[
i
],
total_tokens
=
prompt_tokens
+
previous_num_tokens
[
i
],
)
final_usage_chunk
=
ChatCompletionStreamResponse
(
id
=
request_id
,
object
=
chunk_object_type
,
created
=
created_time
,
choices
=
[],
model
=
model_name
,
usage
=
final_usage
)
final_usage_data
=
(
final_usage_chunk
.
model_dump_json
(
exclude_unset
=
True
,
exclude_none
=
True
))
yield
f
"data:
{
final_usage_data
}
\n\n
"
final_usage_chunk
=
ChatCompletionStreamResponse
(
id
=
request_id
,
object
=
chunk_object_type
,
created
=
created_time
,
choices
=
[],
model
=
model_name
,
usage
=
final_usage
)
final_usage_data
=
(
final_usage_chunk
.
model_dump_json
(
exclude_unset
=
True
,
exclude_none
=
True
))
yield
f
"data:
{
final_usage_data
}
\n\n
"
except
ValueError
as
e
:
# TODO: Use a vllm-specific Validation Error
...
...
vllm/entrypoints/openai/serving_completion.py
View file @
774d1035
...
...
@@ -264,7 +264,8 @@ class OpenAIServingCompletion(OpenAIServing):
)
else
:
final_usage
=
None
response_json
=
CompletionStreamResponse
(
chunk
=
CompletionStreamResponse
(
id
=
request_id
,
created
=
created_time
,
model
=
model_name
,
...
...
@@ -276,10 +277,27 @@ class OpenAIServingCompletion(OpenAIServing):
finish_reason
=
finish_reason
,
stop_reason
=
stop_reason
,
)
],
usage
=
final_usage
,
).
model_dump_json
(
exclude_unset
=
True
)
])
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
chunk
.
usage
=
None
response_json
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
yield
f
"data:
{
response_json
}
\n\n
"
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
final_usage_chunk
=
CompletionStreamResponse
(
id
=
request_id
,
created
=
created_time
,
model
=
model_name
,
choices
=
[],
usage
=
final_usage
,
)
final_usage_data
=
(
final_usage_chunk
.
model_dump_json
(
exclude_unset
=
True
,
exclude_none
=
True
))
yield
f
"data:
{
final_usage_data
}
\n\n
"
except
ValueError
as
e
:
# TODO: Use a vllm-specific Validation Error
data
=
self
.
create_streaming_error_response
(
str
(
e
))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment