Commit ec7e912f authored by swu-hyk

Add performance metrics (durations and token counts) to the Ollama streaming chat response

parent 68e7df3a
@@ -91,8 +91,16 @@ class OllamaChatCompletionRequest(BaseModel):
 class OllamaChatCompletionStreamResponse(BaseModel):
     model: str
     created_at: str
-    message: str
+    message: dict
     done: bool = Field(...)
+    total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
+    load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
+    prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
+    prompt_eval_duration: Optional[int] = Field(None, description="Time spent evaluating prompt in nanoseconds")
+    eval_count: Optional[int] = Field(None, description="Number of tokens generated")
+    eval_duration: Optional[int] = Field(None, description="Time spent generating response in nanoseconds")
 
 class OllamaChatCompletionResponse(BaseModel):
     pass
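For context, the terminating chunk produced by the updated model serializes roughly as shown below. This is a minimal runnable sketch: the class definition is restated from the hunk above, it assumes pydantic v2 (the handler itself calls model_dump_json), and every field value is invented for illustration, not taken from a real run.

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, Field

class OllamaChatCompletionStreamResponse(BaseModel):
    model: str
    created_at: str
    message: dict
    done: bool = Field(...)
    total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
    load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
    prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
    prompt_eval_duration: Optional[int] = Field(None, description="Time spent evaluating prompt in nanoseconds")
    eval_count: Optional[int] = Field(None, description="Number of tokens generated")
    eval_duration: Optional[int] = Field(None, description="Time spent generating response in nanoseconds")

# Build a final chunk the way the handler below does (all values illustrative):
final = OllamaChatCompletionStreamResponse(
    model="example-model",           # hypothetical; the server uses config.model_name
    created_at=str(datetime.now()),
    message={},                      # final chunk carries no message content
    done=True,
    total_duration=2_000_000_000,    # 2 s expressed in nanoseconds
    load_duration=0,
    prompt_eval_count=12,
    prompt_eval_duration=0,
    eval_count=48,
    eval_duration=2_000_000_000,
)
print(final.model_dump_json())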
@@ -111,19 +119,37 @@ async def chat(request: Request, input: OllamaChatCompletionRequest):
     if input.stream:
         async def inner():
+            start_time = time()  # record the start time in seconds
+            eval_count = 0       # number of tokens generated
             async for token in interface.inference(prompt, id):
+                eval_count += 1
                 d = OllamaChatCompletionStreamResponse(
                     model=config.model_name,
                     created_at=str(datetime.now()),
-                    message=token,
+                    message={"role": "assistant", "content": token},
                     done=False
                 )
                 yield d.model_dump_json() + '\n'
+            # compute performance metrics for the terminating chunk
+            end_time = time()
+            total_duration = int((end_time - start_time) * 1_000_000_000)  # seconds -> nanoseconds
+            prompt_eval_count = len(prompt.split())  # rough whitespace-based estimate of prompt tokens
+            eval_duration = total_duration  # simplification: attribute all elapsed time to generation
+            prompt_eval_duration = 0        # no separate prompt-evaluation timing available
+            load_duration = 0               # model load time unknown
             d = OllamaChatCompletionStreamResponse(
                 model=config.model_name,
                 created_at=str(datetime.now()),
-                message='',
-                done=True
+                message={},
+                done=True,
+                total_duration=total_duration,
+                load_duration=load_duration,
+                prompt_eval_count=prompt_eval_count,
+                prompt_eval_duration=prompt_eval_duration,
+                eval_count=eval_count,
+                eval_duration=eval_duration
             )
             yield d.model_dump_json() + '\n'
         return check_link_response(request, inner())
...
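The hunk above attributes the entire elapsed time to generation (eval_duration = total_duration, prompt_eval_duration = 0), as its own comments note. A common refinement is to timestamp the first streamed token and treat everything before it as prompt evaluation. The sketch below shows that split under that heuristic; timed_stream and fake_tokens are hypothetical names, with fake_tokens standing in for interface.inference(prompt, id).

import asyncio
import time

async def timed_stream(token_iter, metrics: dict):
    # Yield tokens unchanged while filling `metrics` with nanosecond timings.
    # Heuristic: wall-clock time before the first token approximates prompt
    # evaluation; everything after it counts as generation.
    start = time.time()
    first = None
    count = 0
    async for token in token_iter:
        if first is None:
            first = time.time()  # time-to-first-token boundary
        count += 1
        yield token
    end = time.time()
    first = first if first is not None else end  # no tokens at all
    metrics["prompt_eval_duration"] = int((first - start) * 1_000_000_000)
    metrics["eval_duration"] = int((end - first) * 1_000_000_000)
    metrics["total_duration"] = int((end - start) * 1_000_000_000)
    metrics["eval_count"] = count

async def fake_tokens():
    # Stand-in token source, purely for the demo.
    for t in ["Hello", ",", " world"]:
        await asyncio.sleep(0.05)  # simulate generation latency
        yield t

async def main():
    metrics = {}
    async for token in timed_stream(fake_tokens(), metrics):
        print(token, end="", flush=True)
    print("\n", metrics)

asyncio.run(main())

Inside inner(), the inference iterator could be wrapped the same way, with the metrics dict read after the loop to populate the final chunk.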
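To exercise the endpoint and read the new metrics from the final chunk, a client along these lines should work. It is a sketch with assumptions: the route path is not shown in this diff, so Ollama's conventional /api/chat is used, the host and port are placeholders, and the model name is hypothetical.

import json

import requests

URL = "http://localhost:8000/api/chat"  # assumed host, port, and route path

payload = {
    "model": "example-model",  # hypothetical model name
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": True,
}

with requests.post(URL, json=payload, stream=True) as resp:
    resp.raise_for_status()
    # The server emits one JSON object per line (newline-delimited JSON).
    for line in resp.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)
        if chunk["done"]:
            # The terminating chunk carries the metrics added by this commit.
            secs = chunk["total_duration"] / 1e9
            print(f"\ntotal: {secs:.2f}s, tokens generated: {chunk['eval_count']}")
        else:
            print(chunk["message"]["content"], end="", flush=True)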