sglang · commit 417fc72f (Unverified)
Authored Mar 21, 2025 by Yuhong Guo; committed by GitHub Mar 20, 2025
Align completion and chat_completion response to OpenAI API (#4637)
parent c6ec7029

1 changed file with 64 additions and 6 deletions:

python/sglang/srt/openai_api/adapter.py (+64, -6)
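What the diff changes: a single `created` Unix timestamp is now computed once per request and reused by the response object and every stream chunk, and cached prompt tokens are reported under `usage.prompt_tokens_details` when cache reporting is enabled, matching the OpenAI API shape. A minimal sketch of that pattern (illustrative only; `build_usage` and the `meta_infos` dicts below are stand-ins, not sglang code):

import time
from typing import Any, Dict, List


def build_usage(meta_infos: List[Dict[str, Any]], cache_report: bool = False) -> Dict[str, Any]:
    """Aggregate per-generation token counts into an OpenAI-style usage block."""
    prompt_tokens = sum(m["prompt_tokens"] for m in meta_infos)
    completion_tokens = sum(m["completion_tokens"] for m in meta_infos)
    usage = {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
    }
    if cache_report:
        # Mirrors OpenAI's usage.prompt_tokens_details.cached_tokens field.
        usage["prompt_tokens_details"] = {
            "cached_tokens": sum(m.get("cached_tokens", 0) for m in meta_infos)
        }
    return usage


if __name__ == "__main__":
    # One timestamp per request, reused by every choice and stream chunk,
    # instead of calling time.time() again for each piece of the response.
    created = int(time.time())
    metas = [
        {"prompt_tokens": 12, "completion_tokens": 30, "cached_tokens": 8},
        {"prompt_tokens": 12, "completion_tokens": 25, "cached_tokens": 8},
    ]
    print({"object": "text_completion", "created": created, "usage": build_usage(metas, cache_report=True)})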
@@ -314,6 +314,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
             )
         try:
+            created = int(time.time())
             ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
             if not isinstance(ret, list):
                 ret = [ret]
@@ -321,13 +322,19 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
                 responses = v1_chat_generate_response(
                     request,
                     ret,
+                    created,
                     to_file=True,
                     cache_report=tokenizer_manager.server_args.enable_cache_report,
                     tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
                 )
             else:
                 responses = v1_generate_response(
-                    request, ret, tokenizer_manager, to_file=True
+                    request,
+                    ret,
+                    tokenizer_manager,
+                    created,
+                    to_file=True,
+                    cache_report=tokenizer_manager.server_args.enable_cache_report,
                 )
         except Exception as e:
@@ -577,7 +584,9 @@ def v1_generate_request(
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]


-def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
+def v1_generate_response(
+    request, ret, tokenizer_manager, created, to_file=False, cache_report=False
+):
     choices = []
     echo = False
@@ -675,7 +684,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
                     # remain the same but if needed we can change that
                     "id": ret[i]["meta_info"]["id"],
                     "object": "text_completion",
-                    "created": int(time.time()),
+                    "created": created,
                     "model": request[i].model,
                     "choices": choice,
                     "usage": {
@@ -694,14 +703,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
             ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
         )
         completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
+        cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
         response = CompletionResponse(
             id=ret[0]["meta_info"]["id"],
             model=request.model,
+            created=created,
             choices=choices,
             usage=UsageInfo(
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=prompt_tokens + completion_tokens,
+                prompt_tokens_details=(
+                    {"cached_tokens": cached_tokens} if cache_report else None
+                ),
             ),
         )
     return response
@@ -710,6 +724,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
     all_requests = [CompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_generate_request(all_requests)

     if adapted_request.stream:
@@ -719,6 +734,8 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
         n_prev_tokens = {}
         prompt_tokens = {}
         completion_tokens = {}
+        cached_tokens = {}
         try:
             async for content in tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -731,6 +748,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 text = content["text"]
                 prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                 completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)

                 if not stream_buffer:  # The first chunk
                     if request.echo:
@@ -803,6 +821,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = CompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     object="text_completion",
                     choices=[choice_data],
                     model=request.model,
@@ -821,14 +840,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 total_completion_tokens = sum(
                     tokens for tokens in completion_tokens.values()
                 )
+                cache_report = tokenizer_manager.server_args.enable_cache_report
+                if cache_report:
+                    cached_tokens_sum = sum(
+                        tokens for tokens in cached_tokens.values()
+                    )
+                    prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+                else:
+                    prompt_tokens_details = None
                 usage = UsageInfo(
                     prompt_tokens=total_prompt_tokens,
                     completion_tokens=total_completion_tokens,
                     total_tokens=total_prompt_tokens + total_completion_tokens,
+                    prompt_tokens_details=prompt_tokens_details,
                 )

                 final_usage_chunk = CompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[],
                     model=request.model,
                     usage=usage,
@@ -859,7 +888,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
     if not isinstance(ret, list):
         ret = [ret]

-    response = v1_generate_response(request, ret, tokenizer_manager)
+    response = v1_generate_response(
+        request,
+        ret,
+        tokenizer_manager,
+        created,
+        cache_report=tokenizer_manager.server_args.enable_cache_report,
+    )

     return response
@@ -1045,6 +1080,7 @@ def v1_chat_generate_request(
 def v1_chat_generate_response(
     request,
     ret,
+    created,
     to_file=False,
     cache_report=False,
     tool_call_parser=None,
@@ -1196,7 +1232,7 @@ def v1_chat_generate_response(
                     # remain the same but if needed we can change that
                     "id": ret[i]["meta_info"]["id"],
                     "object": "chat.completion",
-                    "created": int(time.time()),
+                    "created": created,
                     "model": request[i].model,
                     "choices": choice,
                     "usage": {
@@ -1218,6 +1254,7 @@ def v1_chat_generate_response(
     cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
     response = ChatCompletionResponse(
         id=ret[0]["meta_info"]["id"],
+        created=created,
         model=request.model,
         choices=choices,
         usage=UsageInfo(
@@ -1232,9 +1269,12 @@ def v1_chat_generate_response(
     return response


-async def v1_chat_completions(tokenizer_manager, raw_request: Request):
+async def v1_chat_completions(
+    tokenizer_manager, raw_request: Request, cache_report=False
+):
     request_json = await raw_request.json()
     all_requests = [ChatCompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)

     if adapted_request.stream:
@@ -1247,6 +1287,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
         n_prev_tokens = {}
         prompt_tokens = {}
         completion_tokens = {}
+        cached_tokens = {}
         try:
             async for content in tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -1260,6 +1301,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                 completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)

                 if request.logprobs:
                     logprobs = to_openai_style_logprobs(
                         output_token_logprobs=content["meta_info"][
@@ -1339,6 +1381,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[choice_data],
                     model=request.model,
                 )
@@ -1378,6 +1421,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[choice_data],
                     model=request.model,
                 )
@@ -1414,6 +1458,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[choice_data],
                     model=request.model,
                 )
@@ -1464,6 +1509,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[choice_data],
                     model=request.model,
                 )
@@ -1491,6 +1537,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[choice_data],
                     model=request.model,
                 )
@@ -1506,14 +1553,24 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 total_completion_tokens = sum(
                     tokens for tokens in completion_tokens.values()
                 )
+                cache_report = tokenizer_manager.server_args.enable_cache_report
+                if cache_report:
+                    cached_tokens_sum = sum(
+                        tokens for tokens in cached_tokens.values()
+                    )
+                    prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+                else:
+                    prompt_tokens_details = None
                 usage = UsageInfo(
                     prompt_tokens=total_prompt_tokens,
                     completion_tokens=total_completion_tokens,
                     total_tokens=total_prompt_tokens + total_completion_tokens,
+                    prompt_tokens_details=prompt_tokens_details,
                 )

                 final_usage_chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[],
                     model=request.model,
                     usage=usage,
@@ -1546,6 +1603,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
     response = v1_chat_generate_response(
         request,
         ret,
+        created,
         cache_report=tokenizer_manager.server_args.enable_cache_report,
         tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
         reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
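With these changes, both endpoints can be exercised with the stock `openai` Python client and return OpenAI-shaped responses. A hedged sketch, assuming an sglang server is already serving its OpenAI-compatible API locally; the base URL, placeholder API key, and model name below are assumptions, not part of this commit:

# Requires `pip install openai`; the server address and model name are assumed.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="default", prompt="The capital of France is", max_tokens=8
)
# `created` is a single Unix timestamp shared by the whole response.
print(completion.object, completion.created)
print(completion.usage.prompt_tokens, completion.usage.completion_tokens)
# Populated only when the server reports prefix-cache hits (enable_cache_report).
print(getattr(completion.usage, "prompt_tokens_details", None))

chat = client.chat.completions.create(
    model="default", messages=[{"role": "user", "content": "Hello!"}]
)
print(chat.object, chat.created)
print(getattr(chat.usage, "prompt_tokens_details", None))

When cache reporting is disabled on the server, `prompt_tokens_details` is omitted, matching the `None` branch in the diff above.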