change / sglang · Commit 417fc72f (Unverified)

Align completion and chat_completion response to OpenAI API (#4637)

Authored Mar 21, 2025 by Yuhong Guo; committed via GitHub on Mar 20, 2025.
Parent: c6ec7029

Showing 1 changed file with 64 additions and 6 deletions:

python/sglang/srt/openai_api/adapter.py (+64 -6)
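For API clients, the practical effect of this change is that completion and chat-completion responses now carry a single request-level "created" timestamp (instead of a fresh time.time() call at each serialization point) and, when cache reporting is enabled, an OpenAI-style usage.prompt_tokens_details block. A minimal client-side sketch, not part of this commit, assuming an sglang OpenAI-compatible server at http://localhost:30000 with server_args.enable_cache_report turned on; the model name below is a placeholder:

import requests

resp = requests.post(
    "http://localhost:30000/v1/completions",
    json={"model": "default", "prompt": "Hello", "max_tokens": 8},
).json()

# One timestamp per request, shared by all choices of the response.
print(resp["created"])
# Populated only when cache reporting is enabled; otherwise null.
print(resp["usage"].get("prompt_tokens_details"))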
@@ -314,6 +314,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
         )
 
         try:
+            created = int(time.time())
             ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
             if not isinstance(ret, list):
                 ret = [ret]
@@ -321,13 +322,19 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
                 responses = v1_chat_generate_response(
                     request,
                     ret,
+                    created,
                     to_file=True,
                     cache_report=tokenizer_manager.server_args.enable_cache_report,
                     tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
                 )
             else:
                 responses = v1_generate_response(
-                    request, ret, tokenizer_manager, to_file=True
+                    request,
+                    ret,
+                    tokenizer_manager,
+                    created,
+                    to_file=True,
+                    cache_report=tokenizer_manager.server_args.enable_cache_report,
                 )
 
         except Exception as e:
@@ -577,7 +584,9 @@ def v1_generate_request(
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
 
 
-def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
+def v1_generate_response(
+    request, ret, tokenizer_manager, created, to_file=False, cache_report=False
+):
     choices = []
     echo = False
@@ -675,7 +684,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
                     # remain the same but if needed we can change that
                     "id": ret[i]["meta_info"]["id"],
                     "object": "text_completion",
-                    "created": int(time.time()),
+                    "created": created,
                     "model": request[i].model,
                     "choices": choice,
                     "usage": {
@@ -694,14 +703,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
         ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
     )
     completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
+    cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
     response = CompletionResponse(
         id=ret[0]["meta_info"]["id"],
         model=request.model,
+        created=created,
         choices=choices,
         usage=UsageInfo(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
             total_tokens=prompt_tokens + completion_tokens,
+            prompt_tokens_details=(
+                {"cached_tokens": cached_tokens} if cache_report else None
+            ),
         ),
     )
     return response
@@ -710,6 +724,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
     all_requests = [CompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_generate_request(all_requests)
 
     if adapted_request.stream:
@@ -719,6 +734,8 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
         n_prev_tokens = {}
         prompt_tokens = {}
         completion_tokens = {}
+        cached_tokens = {}
+
         try:
             async for content in tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -731,6 +748,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 text = content["text"]
                 prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                 completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
 
                 if not stream_buffer:  # The first chunk
                     if request.echo:
@@ -803,6 +821,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = CompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     object="text_completion",
                     choices=[choice_data],
                     model=request.model,
@@ -821,14 +840,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
             total_completion_tokens = sum(
                 tokens for tokens in completion_tokens.values()
             )
+            cache_report = tokenizer_manager.server_args.enable_cache_report
+            if cache_report:
+                cached_tokens_sum = sum(
+                    tokens for tokens in cached_tokens.values()
+                )
+                prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+            else:
+                prompt_tokens_details = None
             usage = UsageInfo(
                 prompt_tokens=total_prompt_tokens,
                 completion_tokens=total_completion_tokens,
                 total_tokens=total_prompt_tokens + total_completion_tokens,
+                prompt_tokens_details=prompt_tokens_details,
             )
 
             final_usage_chunk = CompletionStreamResponse(
                 id=content["meta_info"]["id"],
+                created=created,
                 choices=[],
                 model=request.model,
                 usage=usage,
@@ -859,7 +888,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
     if not isinstance(ret, list):
         ret = [ret]
 
-    response = v1_generate_response(request, ret, tokenizer_manager)
+    response = v1_generate_response(
+        request,
+        ret,
+        tokenizer_manager,
+        created,
+        cache_report=tokenizer_manager.server_args.enable_cache_report,
+    )
     return response
@@ -1045,6 +1080,7 @@ def v1_chat_generate_request(
 def v1_chat_generate_response(
     request,
     ret,
+    created,
     to_file=False,
     cache_report=False,
     tool_call_parser=None,
@@ -1196,7 +1232,7 @@ def v1_chat_generate_response(
                     # remain the same but if needed we can change that
                     "id": ret[i]["meta_info"]["id"],
                     "object": "chat.completion",
-                    "created": int(time.time()),
+                    "created": created,
                     "model": request[i].model,
                     "choices": choice,
                     "usage": {
@@ -1218,6 +1254,7 @@ def v1_chat_generate_response(
     cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
     response = ChatCompletionResponse(
         id=ret[0]["meta_info"]["id"],
+        created=created,
         model=request.model,
         choices=choices,
         usage=UsageInfo(
@@ -1232,9 +1269,12 @@ def v1_chat_generate_response(
     return response
 
 
-async def v1_chat_completions(tokenizer_manager, raw_request: Request):
+async def v1_chat_completions(
+    tokenizer_manager, raw_request: Request, cache_report=False
+):
     request_json = await raw_request.json()
     all_requests = [ChatCompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
 
     if adapted_request.stream:
@@ -1247,6 +1287,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
         n_prev_tokens = {}
         prompt_tokens = {}
         completion_tokens = {}
+        cached_tokens = {}
         try:
             async for content in tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -1260,6 +1301,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                 completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
                 if request.logprobs:
                     logprobs = to_openai_style_logprobs(
                         output_token_logprobs=content["meta_info"][
@@ -1339,6 +1381,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1378,6 +1421,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1414,6 +1458,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1464,6 +1509,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[choice_data],
                     model=request.model,
                 )
@@ -1491,6 +1537,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[choice_data],
                     model=request.model,
                 )
@@ -1506,14 +1553,24 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
             total_completion_tokens = sum(
                 tokens for tokens in completion_tokens.values()
             )
+            cache_report = tokenizer_manager.server_args.enable_cache_report
+            if cache_report:
+                cached_tokens_sum = sum(
+                    tokens for tokens in cached_tokens.values()
+                )
+                prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+            else:
+                prompt_tokens_details = None
             usage = UsageInfo(
                 prompt_tokens=total_prompt_tokens,
                 completion_tokens=total_completion_tokens,
                 total_tokens=total_prompt_tokens + total_completion_tokens,
+                prompt_tokens_details=prompt_tokens_details,
             )
 
             final_usage_chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
+                created=created,
                 choices=[],
                 model=request.model,
                 usage=usage,
@@ -1546,6 +1603,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
     response = v1_chat_generate_response(
         request,
         ret,
+        created,
         cache_report=tokenizer_manager.server_args.enable_cache_report,
         tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
         reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
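In the streaming paths, the same request-level "created" value is reused for every chunk, and the final usage-only chunk carries the aggregated prompt_tokens_details. A minimal streaming sketch, again not part of the commit, assuming the same local server; the stream_options field follows the OpenAI convention and is assumed here to be what triggers the final usage chunk:

import json
import requests

with requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "default",  # placeholder model name
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": True,
        "stream_options": {"include_usage": True},  # OpenAI-style; assumed supported
    },
    stream=True,
) as r:
    for line in r.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):]
        if payload == b"[DONE]":
            break
        chunk = json.loads(payload)
        # Every chunk shares the same "created"; the last chunk (empty choices)
        # carries usage with prompt_tokens_details when cache reporting is on.
        print(chunk["created"], chunk.get("usage"))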