"vscode:/vscode.git/clone" did not exist on "33449ee42db14f384901718c5aa1524ad6482ff6"
Unverified commit 8ce34b3b, authored by Yuhao Tsui, committed by GitHub
Browse files

Modify the performance calculation module

Change the performance data calculation from estimating the values to retrieving them from `raw_usage`.
parent 6e4da83d
...@@ -115,6 +115,7 @@ class OllamaChatCompletionStreamResponse(BaseModel): ...@@ -115,6 +115,7 @@ class OllamaChatCompletionStreamResponse(BaseModel):
created_at: str created_at: str
message: dict message: dict
done: bool = Field(...) done: bool = Field(...)
done_reason: Optional[str] = Field("", description="done_reason")
total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds") total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds") load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt") prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
...@@ -127,6 +128,7 @@ class OllamaChatCompletionResponse(BaseModel): ...@@ -127,6 +128,7 @@ class OllamaChatCompletionResponse(BaseModel):
created_at: str created_at: str
message: dict message: dict
done: bool done: bool
done_reason: Optional[str] = Field("", description="done_reason")
total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds") total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds") load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt") prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
...@@ -140,19 +142,14 @@ async def chat(request: Request, input: OllamaChatCompletionRequest): ...@@ -140,19 +142,14 @@ async def chat(request: Request, input: OllamaChatCompletionRequest):
interface: BackendInterfaceBase = get_interface() interface: BackendInterfaceBase = get_interface()
config = Config() config = Config()
# 将消息转换为提示字符串 input_message = [json.loads(m.model_dump_json()) for m in input.messages]
prompt = ""
for msg in input.messages:
prompt += f"{msg.role}: {msg.content}\n"
prompt += "assistant:"
if input.stream: if input.stream:
async def inner(): async def inner():
start_time = time() # 记录开始时间(秒) start_time = time() # 记录开始时间(秒)
eval_count = 0 # 统计生成的 token 数量
tokens = [] tokens = []
async for res in interface.inference(prompt, id): async for res in interface.inference(input_message, id):
if isinstance(res, RawUsage): if isinstance(res, RawUsage):
raw_usage = res raw_usage = res
else: else:
...@@ -166,11 +163,13 @@ async def chat(request: Request, input: OllamaChatCompletionRequest): ...@@ -166,11 +163,13 @@ async def chat(request: Request, input: OllamaChatCompletionRequest):
yield d.model_dump_json() + '\n' yield d.model_dump_json() + '\n'
# 计算性能数据 # 计算性能数据
end_time = time() end_time = time()
total_duration = int((end_time - start_time) * 1_000_000_000) # 转换为纳秒 total_duration = int((end_time - start_time) * 1_000_000_000) # unit: ns
prompt_eval_count = len(prompt.split()) # 简单估算提示词数量 prompt_eval_count = raw_usage.prefill_count
eval_duration = total_duration # 假设全部时间用于生成(简化) eval_count = raw_usage.decode_count
prompt_eval_duration = 0 # 假设无单独提示评估时间 eval_duration = int(raw_usage.decode_time * 1_000_000_000)
load_duration = 0 # 假设加载时间未知 prompt_eval_duration = int(raw_usage.prefill_time * 1_000_000_000)
load_duration = int(raw_usage.tokenize_time * 1_000_000_000)
done_reason = finish_reason
d = OllamaChatCompletionStreamResponse( d = OllamaChatCompletionStreamResponse(
model=config.model_name, model=config.model_name,
...@@ -182,7 +181,8 @@ async def chat(request: Request, input: OllamaChatCompletionRequest): ...@@ -182,7 +181,8 @@ async def chat(request: Request, input: OllamaChatCompletionRequest):
prompt_eval_count=prompt_eval_count, prompt_eval_count=prompt_eval_count,
prompt_eval_duration=prompt_eval_duration, prompt_eval_duration=prompt_eval_duration,
eval_count=eval_count, eval_count=eval_count,
eval_duration=eval_duration eval_duration=eval_duration,
done_reason=done_reason
) )
yield d.model_dump_json() + '\n' yield d.model_dump_json() + '\n'
return check_link_response(request, inner()) return check_link_response(request, inner())
...@@ -191,20 +191,22 @@ async def chat(request: Request, input: OllamaChatCompletionRequest): ...@@ -191,20 +191,22 @@ async def chat(request: Request, input: OllamaChatCompletionRequest):
complete_response = "" complete_response = ""
eval_count = 0 eval_count = 0
async for res in interface.inference(prompt, id): async for res in interface.inference(input_message, id):
if isinstance(res, RawUsage): if isinstance(res, RawUsage):
raw_usage = res raw_usage = res
else: else:
token, finish_reason = res token, finish_reason = res
complete_response += token complete_response += token
eval_count += 1
end_time = time() end_time = time()
total_duration = int((end_time - start_time) * 1_000_000_000) total_duration = int((end_time - start_time) * 1_000_000_000) # unit: ns
prompt_eval_count = len(prompt.split()) prompt_eval_count = raw_usage.prefill_count
eval_duration = total_duration eval_count = raw_usage.decode_count
prompt_eval_duration = 0 eval_duration = int(raw_usage.decode_time * 1_000_000_000)
load_duration = 0 prompt_eval_duration = int(raw_usage.prefill_time * 1_000_000_000)
load_duration = int(raw_usage.tokenize_time * 1_000_000_000)
done_reason = finish_reason
response = OllamaChatCompletionResponse( response = OllamaChatCompletionResponse(
model=config.model_name, model=config.model_name,
...@@ -216,7 +218,8 @@ async def chat(request: Request, input: OllamaChatCompletionRequest): ...@@ -216,7 +218,8 @@ async def chat(request: Request, input: OllamaChatCompletionRequest):
prompt_eval_count=prompt_eval_count, prompt_eval_count=prompt_eval_count,
prompt_eval_duration=prompt_eval_duration, prompt_eval_duration=prompt_eval_duration,
eval_count=eval_count, eval_count=eval_count,
eval_duration=eval_duration eval_duration=eval_duration,
done_reason=done_reason
) )
return response return response
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment