Unverified commit 417fc72f authored by Yuhong Guo, committed by GitHub

Align completion and chat_completion response to OpenAI API (#4637)

parent c6ec7029
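From a client's point of view, the commit makes two things line up with the OpenAI response schema: every completion and chat completion response now carries a `created` timestamp that is captured once per request (so a non-streaming response, all of its streamed chunks, and the final usage chunk report the same value), and when the server runs with cache reporting enabled, `usage.prompt_tokens_details.cached_tokens` is filled in rather than omitted. A minimal sketch of checking this with the `openai` Python client follows; the base URL, the placeholder model name, and launching the server with `--enable-cache-report` are assumptions about a typical local setup, not something stated in the diff itself.

# Sketch only: assumes an SGLang server is already running locally with cache
# reporting enabled (e.g. started with --enable-cache-report) and reachable at
# http://localhost:30000/v1; "default" is a placeholder model name.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# Non-streaming chat completion: `created` and `usage.prompt_tokens_details`
# should be present, matching the OpenAI schema.
resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Say hi"}],
)
print(resp.created)                      # UNIX timestamp captured per request
print(resp.usage.prompt_tokens_details)  # cached_tokens when cache reporting is on
                                         # (requires a recent openai client version)

# Streaming completion: each chunk should now carry the same `created` value,
# because the timestamp is taken once and threaded through every chunk.
created_values = set()
for chunk in client.completions.create(model="default", prompt="Hello", stream=True):
    created_values.add(chunk.created)
assert len(created_values) == 1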
@@ -314,6 +314,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
             )
         try:
+            created = int(time.time())
             ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
             if not isinstance(ret, list):
                 ret = [ret]
@@ -321,13 +322,19 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
                 responses = v1_chat_generate_response(
                     request,
                     ret,
+                    created,
                     to_file=True,
                     cache_report=tokenizer_manager.server_args.enable_cache_report,
                     tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
                 )
             else:
                 responses = v1_generate_response(
-                    request, ret, tokenizer_manager, to_file=True
+                    request,
+                    ret,
+                    tokenizer_manager,
+                    created,
+                    to_file=True,
+                    cache_report=tokenizer_manager.server_args.enable_cache_report,
                 )
         except Exception as e:
@@ -577,7 +584,9 @@ def v1_generate_request(
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]


-def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
+def v1_generate_response(
+    request, ret, tokenizer_manager, created, to_file=False, cache_report=False
+):
     choices = []
     echo = False
@@ -675,7 +684,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
                 # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "text_completion",
-                "created": int(time.time()),
+                "created": created,
                 "model": request[i].model,
                 "choices": choice,
                 "usage": {
@@ -694,14 +703,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
         ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
     )
     completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
+    cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
     response = CompletionResponse(
         id=ret[0]["meta_info"]["id"],
         model=request.model,
+        created=created,
         choices=choices,
         usage=UsageInfo(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
             total_tokens=prompt_tokens + completion_tokens,
+            prompt_tokens_details=(
+                {"cached_tokens": cached_tokens} if cache_report else None
+            ),
         ),
     )
     return response
@@ -710,6 +724,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
     all_requests = [CompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_generate_request(all_requests)

     if adapted_request.stream:
@@ -719,6 +734,8 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
         n_prev_tokens = {}
         prompt_tokens = {}
         completion_tokens = {}
+        cached_tokens = {}
         try:
             async for content in tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -731,6 +748,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 text = content["text"]
                 prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                 completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
                 if not stream_buffer:  # The first chunk
                     if request.echo:
@@ -803,6 +821,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = CompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     object="text_completion",
                     choices=[choice_data],
                     model=request.model,
@@ -821,14 +840,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 total_completion_tokens = sum(
                     tokens for tokens in completion_tokens.values()
                 )
+                cache_report = tokenizer_manager.server_args.enable_cache_report
+                if cache_report:
+                    cached_tokens_sum = sum(
+                        tokens for tokens in cached_tokens.values()
+                    )
+                    prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+                else:
+                    prompt_tokens_details = None
                 usage = UsageInfo(
                     prompt_tokens=total_prompt_tokens,
                     completion_tokens=total_completion_tokens,
                     total_tokens=total_prompt_tokens + total_completion_tokens,
+                    prompt_tokens_details=prompt_tokens_details,
                 )
                 final_usage_chunk = CompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[],
                     model=request.model,
                     usage=usage,
@@ -859,7 +888,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
     if not isinstance(ret, list):
         ret = [ret]

-    response = v1_generate_response(request, ret, tokenizer_manager)
+    response = v1_generate_response(
+        request,
+        ret,
+        tokenizer_manager,
+        created,
+        cache_report=tokenizer_manager.server_args.enable_cache_report,
+    )
     return response
@@ -1045,6 +1080,7 @@ def v1_chat_generate_request(
 def v1_chat_generate_response(
     request,
     ret,
+    created,
     to_file=False,
     cache_report=False,
     tool_call_parser=None,
@@ -1196,7 +1232,7 @@ def v1_chat_generate_response(
                 # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "chat.completion",
-                "created": int(time.time()),
+                "created": created,
                 "model": request[i].model,
                 "choices": choice,
                 "usage": {
@@ -1218,6 +1254,7 @@ def v1_chat_generate_response(
     cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
     response = ChatCompletionResponse(
         id=ret[0]["meta_info"]["id"],
+        created=created,
         model=request.model,
         choices=choices,
         usage=UsageInfo(
@@ -1232,9 +1269,12 @@ def v1_chat_generate_response(
     return response


-async def v1_chat_completions(tokenizer_manager, raw_request: Request):
+async def v1_chat_completions(
+    tokenizer_manager, raw_request: Request, cache_report=False
+):
     request_json = await raw_request.json()
     all_requests = [ChatCompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)

     if adapted_request.stream:
@@ -1247,6 +1287,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
         n_prev_tokens = {}
         prompt_tokens = {}
         completion_tokens = {}
+        cached_tokens = {}
         try:
             async for content in tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -1260,6 +1301,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                 completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
                 if request.logprobs:
                     logprobs = to_openai_style_logprobs(
                         output_token_logprobs=content["meta_info"][
@@ -1339,6 +1381,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1378,6 +1421,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1414,6 +1458,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1464,6 +1509,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1491,6 +1537,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1506,14 +1553,24 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 total_completion_tokens = sum(
                     tokens for tokens in completion_tokens.values()
                 )
+                cache_report = tokenizer_manager.server_args.enable_cache_report
+                if cache_report:
+                    cached_tokens_sum = sum(
+                        tokens for tokens in cached_tokens.values()
+                    )
+                    prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+                else:
+                    prompt_tokens_details = None
                 usage = UsageInfo(
                     prompt_tokens=total_prompt_tokens,
                     completion_tokens=total_completion_tokens,
                     total_tokens=total_prompt_tokens + total_completion_tokens,
+                    prompt_tokens_details=prompt_tokens_details,
                 )
                 final_usage_chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[],
                     model=request.model,
                     usage=usage,
@@ -1546,6 +1603,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
     response = v1_chat_generate_response(
         request,
         ret,
+        created,
         cache_report=tokenizer_manager.server_args.enable_cache_report,
         tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
         reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
...
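The server-side pattern is identical in both endpoints: take one `int(time.time())` snapshot when the request arrives, pass it down to `v1_generate_response` / `v1_chat_generate_response` and to every `*StreamResponse` chunk, and only attach `prompt_tokens_details` when `enable_cache_report` is set. The condensed sketch below restates that pattern in isolation; it is not the adapter code itself, and the helper names (`build_usage`, `stream_response`) are illustrative.

import time


def build_usage(prompt_tokens, completion_tokens, cached_tokens, cache_report=False):
    # Same shape as UsageInfo in the diff: prompt_tokens_details is only
    # populated when cache reporting is enabled, otherwise it stays None.
    return {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
        "prompt_tokens_details": (
            {"cached_tokens": cached_tokens} if cache_report else None
        ),
    }


def stream_response(generated_chunks, model, cache_report=False):
    # One timestamp per request; every chunk and the trailing usage chunk
    # reuse it, so `created` stays stable for the whole completion.
    created = int(time.time())
    prompt_tokens = completion_tokens = cached_tokens = 0
    for meta in generated_chunks:
        prompt_tokens = meta["prompt_tokens"]
        completion_tokens = meta["completion_tokens"]
        cached_tokens = meta.get("cached_tokens", 0)
        yield {"object": "text_completion", "created": created,
               "model": model, "text": meta["text"]}
    yield {"object": "text_completion", "created": created, "model": model,
           "choices": [],
           "usage": build_usage(prompt_tokens, completion_tokens,
                                cached_tokens, cache_report)}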