Unverified Commit e5db40dc authored by Michael Feil's avatar Michael Feil Committed by GitHub
Browse files

ORJson. Faster Json serialization (#1694)

parent b1709305
......@@ -28,7 +28,9 @@ import os
import threading
import time
from http import HTTPStatus
from typing import Dict, List, Optional, Union
from typing import AsyncIterator, Dict, List, Optional, Union
import orjson
# Fix a bug of Python threading
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
......@@ -192,14 +194,18 @@ async def generate_request(obj: GenerateReqInput, request: Request):
"""Handle a generate request."""
if obj.stream:
async def stream_results() -> AsyncIterator[bytes]:
    """Yield generation results as Server-Sent-Events `data:` frames.

    Each result dict is serialized with orjson (faster than stdlib json);
    errors from the generator are reported as a final error frame, and the
    stream is always terminated with the SSE sentinel `data: [DONE]`.
    """
    try:
        async for out in tokenizer_manager.generate_request(obj, request):
            # OPT_NON_STR_KEYS permits non-string dict keys, which stdlib
            # json.dumps would have coerced to strings.
            yield b"data: " + orjson.dumps(
                out, option=orjson.OPT_NON_STR_KEYS
            ) + b"\n\n"
    except ValueError as e:
        # Surface generation errors to the client as an SSE error frame
        # instead of aborting the response mid-stream.
        out = {"error": {"message": str(e)}}
        yield b"data: " + orjson.dumps(
            out, option=orjson.OPT_NON_STR_KEYS
        ) + b"\n\n"
    # SSE termination sentinel expected by OpenAI-style streaming clients.
    yield b"data: [DONE]\n\n"
return StreamingResponse(
stream_results(),
......@@ -260,13 +266,13 @@ async def openai_v1_chat_completions(raw_request: Request):
return await v1_chat_completions(tokenizer_manager, raw_request)
@app.post("/v1/embeddings", response_class=ORJSONResponse)
async def openai_v1_embeddings(raw_request: Request):
    """OpenAI-compatible embeddings endpoint.

    Delegates to `v1_embeddings` and returns its result; the response is
    serialized with ORJSONResponse for faster JSON encoding.
    """
    response = await v1_embeddings(tokenizer_manager, raw_request)
    return response
@app.get("/v1/models")
@app.get("/v1/models", response_class=ORJSONResponse)
def available_models():
"""Show available models."""
served_model_names = [tokenizer_manager.served_model_name]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment