Unverified commit 00e6402d, authored by Chauncey, committed by GitHub
Browse files

[Frontend] track responsesAPI server_load (#32323)


Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
parent ce094624
...@@ -259,6 +259,10 @@ def engine_client(request: Request) -> EngineClient: ...@@ -259,6 +259,10 @@ def engine_client(request: Request) -> EngineClient:
async def get_server_load_metrics(request: Request): async def get_server_load_metrics(request: Request):
# This endpoint returns the current server load metrics. # This endpoint returns the current server load metrics.
# It tracks requests utilizing the GPU from the following routes: # It tracks requests utilizing the GPU from the following routes:
# - /v1/responses
# - /v1/responses/{response_id}
# - /v1/responses/{response_id}/cancel
# - /v1/messages
# - /v1/chat/completions # - /v1/chat/completions
# - /v1/completions # - /v1/completions
# - /v1/audio/transcriptions # - /v1/audio/transcriptions
......
...@@ -17,6 +17,7 @@ from vllm.entrypoints.openai.responses.protocol import ( ...@@ -17,6 +17,7 @@ from vllm.entrypoints.openai.responses.protocol import (
from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses
from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import ( from vllm.entrypoints.utils import (
load_aware_call,
with_cancellation, with_cancellation,
) )
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -54,6 +55,7 @@ async def _convert_stream_to_sse_events( ...@@ -54,6 +55,7 @@ async def _convert_stream_to_sse_events(
}, },
) )
@with_cancellation @with_cancellation
@load_aware_call
async def create_responses(request: ResponsesRequest, raw_request: Request): async def create_responses(request: ResponsesRequest, raw_request: Request):
handler = responses(raw_request) handler = responses(raw_request)
if handler is None: if handler is None:
...@@ -79,6 +81,7 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): ...@@ -79,6 +81,7 @@ async def create_responses(request: ResponsesRequest, raw_request: Request):
@router.get("/v1/responses/{response_id}") @router.get("/v1/responses/{response_id}")
@load_aware_call
async def retrieve_responses( async def retrieve_responses(
response_id: str, response_id: str,
raw_request: Request, raw_request: Request,
...@@ -113,6 +116,7 @@ async def retrieve_responses( ...@@ -113,6 +116,7 @@ async def retrieve_responses(
@router.post("/v1/responses/{response_id}/cancel") @router.post("/v1/responses/{response_id}/cancel")
@load_aware_call
async def cancel_responses(response_id: str, raw_request: Request): async def cancel_responses(response_id: str, raw_request: Request):
handler = responses(raw_request) handler = responses(raw_request)
if handler is None: if handler is None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment