Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
58f53034
Unverified
Commit
58f53034
authored
Jul 23, 2024
by
Yehoshua Cohen
Committed by
GitHub
Jul 23, 2024
Browse files
[Frontend] Add Usage data in each chunk for chat_serving. #6540 (#6652)
parent
0eb0757b
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
78 additions
and
12 deletions
+78
-12
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+32
-8
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+46
-4
No files found.
tests/entrypoints/openai/test_chat.py
View file @
58f53034
...
@@ -295,14 +295,19 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
...
@@ -295,14 +295,19 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
assert
chunk
.
usage
is
None
assert
chunk
.
usage
is
None
# Test stream=True, stream_options={"include_usage": True}
# Test stream=True, stream_options={"include_usage": True,
stream
=
await
client
.
chat
.
completions
.
create
(
# "continuous_usage_stats": False}}
model
=
model_name
,
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
})
stream_options
=
{
"include_usage"
:
True
,
"continuous_usage_stats"
:
False
})
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
if
chunk
.
choices
[
0
].
finish_reason
is
None
:
if
chunk
.
choices
[
0
].
finish_reason
is
None
:
...
@@ -338,6 +343,25 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
...
@@ -338,6 +343,25 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
stream
=
False
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
True
})
stream_options
=
{
"include_usage"
:
True
})
# Test stream=True, stream_options={"include_usage": True,
# "continuous_usage_stats": True}
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
,
"continuous_usage_stats"
:
True
},
)
async
for
chunk
in
stream
:
assert
chunk
.
usage
.
prompt_tokens
>=
0
assert
chunk
.
usage
.
completion_tokens
>=
0
assert
chunk
.
usage
.
total_tokens
==
(
chunk
.
usage
.
prompt_tokens
+
chunk
.
usage
.
completion_tokens
)
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
# (i.e. using the same ordering as in the Completions API tests), the test
# (i.e. using the same ordering as in the Completions API tests), the test
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
58f53034
...
@@ -247,7 +247,15 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -247,7 +247,15 @@ class OpenAIServingChat(OpenAIServing):
model
=
model_name
)
model
=
model_name
)
if
(
request
.
stream_options
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
and
request
.
stream_options
.
include_usage
):
chunk
.
usage
=
None
if
(
request
.
stream_options
.
continuous_usage_stats
):
prompt_tokens
=
len
(
res
.
prompt_token_ids
)
usage
=
UsageInfo
(
prompt_tokens
=
prompt_tokens
,
completion_tokens
=
0
,
total_tokens
=
prompt_tokens
)
chunk
.
usage
=
usage
else
:
chunk
.
usage
=
None
data
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
data
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
yield
f
"data:
{
data
}
\n\n
"
yield
f
"data:
{
data
}
\n\n
"
...
@@ -277,7 +285,18 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -277,7 +285,18 @@ class OpenAIServingChat(OpenAIServing):
model
=
model_name
)
model
=
model_name
)
if
(
request
.
stream_options
and
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
request
.
stream_options
.
include_usage
):
chunk
.
usage
=
None
if
(
request
.
stream_options
.
continuous_usage_stats
):
prompt_tokens
=
len
(
res
.
prompt_token_ids
)
usage
=
UsageInfo
(
prompt_tokens
=
prompt_tokens
,
completion_tokens
=
0
,
total_tokens
=
prompt_tokens
)
chunk
.
usage
=
usage
else
:
chunk
.
usage
=
None
data
=
chunk
.
model_dump_json
(
data
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
exclude_unset
=
True
)
yield
f
"data:
{
data
}
\n\n
"
yield
f
"data:
{
data
}
\n\n
"
...
@@ -336,7 +355,19 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -336,7 +355,19 @@ class OpenAIServingChat(OpenAIServing):
model
=
model_name
)
model
=
model_name
)
if
(
request
.
stream_options
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
and
request
.
stream_options
.
include_usage
):
chunk
.
usage
=
None
if
(
request
.
stream_options
.
continuous_usage_stats
):
prompt_tokens
=
len
(
res
.
prompt_token_ids
)
completion_tokens
=
len
(
output
.
token_ids
)
usage
=
UsageInfo
(
prompt_tokens
=
prompt_tokens
,
completion_tokens
=
completion_tokens
,
total_tokens
=
prompt_tokens
+
completion_tokens
,
)
chunk
.
usage
=
usage
else
:
chunk
.
usage
=
None
data
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
data
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
yield
f
"data:
{
data
}
\n\n
"
yield
f
"data:
{
data
}
\n\n
"
else
:
else
:
...
@@ -356,7 +387,18 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -356,7 +387,18 @@ class OpenAIServingChat(OpenAIServing):
model
=
model_name
)
model
=
model_name
)
if
(
request
.
stream_options
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
and
request
.
stream_options
.
include_usage
):
chunk
.
usage
=
None
if
(
request
.
stream_options
.
continuous_usage_stats
):
prompt_tokens
=
len
(
res
.
prompt_token_ids
)
completion_tokens
=
len
(
output
.
token_ids
)
usage
=
UsageInfo
(
prompt_tokens
=
prompt_tokens
,
completion_tokens
=
completion_tokens
,
total_tokens
=
prompt_tokens
+
completion_tokens
,
)
chunk
.
usage
=
usage
else
:
chunk
.
usage
=
None
data
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
data
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
yield
f
"data:
{
data
}
\n\n
"
yield
f
"data:
{
data
}
\n\n
"
finish_reason_sent
[
i
]
=
True
finish_reason_sent
[
i
]
=
True
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment