Unverified commit 4c8d1bf3, authored by Augusto Yao and committed by GitHub
Browse files

use ORJSONResponse when available to improve the efficiency of request process (#33548)


Signed-off-by: augusto.yjh <augusto.yjh@antgroup.com>
parent 061da6bc
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
from functools import lru_cache
from http import HTTPStatus
from fastapi import APIRouter, Depends, Request
......@@ -15,9 +17,24 @@ from vllm.entrypoints.pooling.embed.protocol import (
)
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
from vllm.entrypoints.utils import load_aware_call, with_cancellation
from vllm.logger import init_logger
# Router instance for this module; the route registrations are outside the
# visible excerpt -- TODO confirm which handlers attach to it.
router = APIRouter()
# Module-level logger; _get_json_response_cls uses it to emit the orjson
# installation hint.
logger = init_logger(__name__)
@lru_cache(maxsize=1)
def _get_json_response_cls():
    """Pick the response class used to serialize JSON embedding responses.

    Returns FastAPI's ``ORJSONResponse`` when the ``orjson`` package can be
    located, otherwise logs an installation hint and returns ``JSONResponse``.
    The result is memoized, so the package lookup is not repeated on later
    calls.
    """
    orjson_missing = importlib.util.find_spec("orjson") is None
    if orjson_missing:
        logger.warning_once(
            "To make v1/embeddings API fast, please install orjson by `pip install orjson`"
        )
        return JSONResponse

    # Imported lazily so the orjson-backed class is only touched once the
    # package has been found.
    from fastapi.responses import ORJSONResponse

    return ORJSONResponse
def embedding(request: Request) -> OpenAIServingEmbedding | None:
    """Return the embedding serving handler stored on the application state.

    Reads ``openai_serving_embedding`` from ``request.app.state``; per the
    return annotation the value may be ``None``.
    """
    app_state = request.app.state
    return app_state.openai_serving_embedding
......@@ -54,7 +71,7 @@ async def create_embedding(
content=generator.model_dump(), status_code=generator.error.code
)
elif isinstance(generator, EmbeddingResponse):
return JSONResponse(content=generator.model_dump())
return _get_json_response_cls()(content=generator.model_dump())
elif isinstance(generator, EmbeddingBytesResponse):
return StreamingResponse(
content=generator.content,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment