Unverified commit fe714dd5, authored by Ning Xie, committed by GitHub
Browse files

[openapi server] log exception in exception handler(2/N) (#36201)


Signed-off-by: Andy Xie <andy.xning@gmail.com>
parent 8ab3d742
...@@ -196,7 +196,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path): ...@@ -196,7 +196,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
invalid_files.mkdir() invalid_files.mkdir()
(invalid_files / "adapter_config.json").write_text("this is not json") (invalid_files / "adapter_config.json").write_text("this is not json")
with pytest.raises(openai.BadRequestError): with pytest.raises(openai.InternalServerError):
await client.post( await client.post(
"load_lora_adapter", "load_lora_adapter",
cast_to=str, cast_to=str,
...@@ -232,7 +232,7 @@ async def test_dynamic_lora_badrequests( ...@@ -232,7 +232,7 @@ async def test_dynamic_lora_badrequests(
json.dump(adapter_config, f) json.dump(adapter_config, f)
# Test loading the adapter # Test loading the adapter
with pytest.raises(openai.BadRequestError, match=expected_error): with pytest.raises(openai.InternalServerError, match=expected_error):
await client.post( await client.post(
"load_lora_adapter", "load_lora_adapter",
cast_to=str, cast_to=str,
...@@ -312,7 +312,7 @@ async def test_loading_invalid_adapters_does_not_break_others( ...@@ -312,7 +312,7 @@ async def test_loading_invalid_adapters_does_not_break_others(
body={"lora_name": "notfound", "lora_path": "/not/an/adapter"}, body={"lora_name": "notfound", "lora_path": "/not/an/adapter"},
) )
for _ in range(25): for _ in range(25):
with suppress(openai.BadRequestError): with suppress(openai.InternalServerError):
await client.post( await client.post(
"load_lora_adapter", "load_lora_adapter",
cast_to=str, cast_to=str,
......
...@@ -88,7 +88,7 @@ async def test_sagemaker_load_adapter_invalid_files( ...@@ -88,7 +88,7 @@ async def test_sagemaker_load_adapter_invalid_files(
basic_server_with_lora.url_for("adapters"), basic_server_with_lora.url_for("adapters"),
json={"name": "invalid-adapter", "src": str(invalid_files)}, json={"name": "invalid-adapter", "src": str(invalid_files)},
) )
assert load_response.status_code == 400 assert load_response.status_code == 500
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -62,7 +62,7 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques ...@@ -62,7 +62,7 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization base_server = raw_request.app.state.openai_serving_tokenization
error = base_server.create_error_response( error = base_server.create_error_response(
message="The model does not support Messages API" NotImplementedError("The model does not support Messages API")
) )
return translate_error_response(error) return translate_error_response(error)
...@@ -108,7 +108,7 @@ async def count_tokens(request: AnthropicCountTokensRequest, raw_request: Reques ...@@ -108,7 +108,7 @@ async def count_tokens(request: AnthropicCountTokensRequest, raw_request: Reques
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization base_server = raw_request.app.state.openai_serving_tokenization
error = base_server.create_error_response( error = base_server.create_error_response(
message="The model does not support Messages API" NotImplementedError("The model does not support Messages API")
) )
return translate_error_response(error) return translate_error_response(error)
......
...@@ -50,10 +50,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re ...@@ -50,10 +50,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
) )
handler = chat(raw_request) handler = chat(raw_request)
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization raise NotImplementedError("The model does not support Chat Completions API")
return base_server.create_error_response(
message="The model does not support Chat Completions API"
)
generator = await handler.create_chat_completion(request, raw_request) generator = await handler.create_chat_completion(request, raw_request)
......
...@@ -49,10 +49,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request): ...@@ -49,10 +49,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
) )
handler = completion(raw_request) handler = completion(raw_request)
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization raise NotImplementedError("The model does not support Completions API")
return base_server.create_error_response(
message="The model does not support Completions API"
)
generator = await handler.create_completion(request, raw_request) generator = await handler.create_completion(request, raw_request)
......
...@@ -7,7 +7,6 @@ from http import HTTPStatus ...@@ -7,7 +7,6 @@ from http import HTTPStatus
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse, ErrorResponse,
ModelCard, ModelCard,
ModelList, ModelList,
...@@ -18,7 +17,8 @@ from vllm.entrypoints.serve.lora.protocol import ( ...@@ -18,7 +17,8 @@ from vllm.entrypoints.serve.lora.protocol import (
LoadLoRAAdapterRequest, LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest, UnloadLoRAAdapterRequest,
) )
from vllm.entrypoints.utils import sanitize_message from vllm.entrypoints.utils import create_error_response
from vllm.exceptions import LoRAAdapterNotFoundError
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
...@@ -152,15 +152,15 @@ class OpenAIServingModels: ...@@ -152,15 +152,15 @@ class OpenAIServingModels:
try: try:
await self.engine_client.add_lora(lora_request) await self.engine_client.add_lora(lora_request)
except Exception as e: except Exception as e:
error_type = "BadRequestError" if str(
status_code = HTTPStatus.BAD_REQUEST LoRAAdapterNotFoundError(
if "No adapter found" in str(e): lora_request.lora_name, lora_request.lora_path
error_type = "NotFoundError" )
status_code = HTTPStatus.NOT_FOUND ) in str(e):
raise LoRAAdapterNotFoundError(
return create_error_response( lora_request.lora_name, lora_request.lora_path
message=str(e), err_type=error_type, status_code=status_code ) from e
) raise
self.lora_requests[lora_name] = lora_request self.lora_requests[lora_name] = lora_request
logger.info( logger.info(
...@@ -292,17 +292,3 @@ class OpenAIServingModels: ...@@ -292,17 +292,3 @@ class OpenAIServingModels:
err_type="NotFoundError", err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND, status_code=HTTPStatus.NOT_FOUND,
) )
def create_error_response(
message: str,
err_type: str = "BadRequestError",
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
) -> ErrorResponse:
return ErrorResponse(
error=ErrorInfo(
message=sanitize_message(message),
type=err_type,
code=status_code.value,
)
)
...@@ -59,10 +59,7 @@ async def _convert_stream_to_sse_events( ...@@ -59,10 +59,7 @@ async def _convert_stream_to_sse_events(
async def create_responses(request: ResponsesRequest, raw_request: Request): async def create_responses(request: ResponsesRequest, raw_request: Request):
handler = responses(raw_request) handler = responses(raw_request)
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization raise NotImplementedError("The model does not support Responses API")
return base_server.create_error_response(
message="The model does not support Responses API"
)
generator = await handler.create_responses(request, raw_request) generator = await handler.create_responses(request, raw_request)
...@@ -88,10 +85,7 @@ async def retrieve_responses( ...@@ -88,10 +85,7 @@ async def retrieve_responses(
): ):
handler = responses(raw_request) handler = responses(raw_request)
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization raise NotImplementedError("The model does not support Responses API")
return base_server.create_error_response(
message="The model does not support Responses API"
)
response = await handler.retrieve_responses( response = await handler.retrieve_responses(
response_id, response_id,
...@@ -115,10 +109,7 @@ async def retrieve_responses( ...@@ -115,10 +109,7 @@ async def retrieve_responses(
async def cancel_responses(response_id: str, raw_request: Request): async def cancel_responses(response_id: str, raw_request: Request):
handler = responses(raw_request) handler = responses(raw_request)
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization raise NotImplementedError("The model does not support Responses API")
return base_server.create_error_response(
message="The model does not support Responses API"
)
response = await handler.cancel_responses(response_id) response = await handler.cancel_responses(response_id)
......
...@@ -65,10 +65,7 @@ async def create_transcriptions( ...@@ -65,10 +65,7 @@ async def create_transcriptions(
): ):
handler = transcription(raw_request) handler = transcription(raw_request)
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization raise NotImplementedError("The model does not support Transcriptions API")
return base_server.create_error_response(
message="The model does not support Transcriptions API"
)
audio_data = await request.file.read() audio_data = await request.file.read()
...@@ -101,10 +98,7 @@ async def create_translations( ...@@ -101,10 +98,7 @@ async def create_translations(
): ):
handler = translation(raw_request) handler = translation(raw_request)
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization raise NotImplementedError("The model does not support Translations API")
return base_server.create_error_response(
message="The model does not support Translations API"
)
audio_data = await request.file.read() audio_data = await request.file.read()
......
...@@ -2,13 +2,12 @@ ...@@ -2,13 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import APIRouter, Depends, Request from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse, Response from fastapi.responses import Response
from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
from vllm.entrypoints.pooling.classify.serving import ServingClassification from vllm.entrypoints.pooling.classify.serving import ServingClassification
from vllm.entrypoints.utils import ( from vllm.entrypoints.utils import (
create_error_response,
load_aware_call, load_aware_call,
with_cancellation, with_cancellation,
) )
...@@ -28,12 +27,6 @@ async def create_classify( ...@@ -28,12 +27,6 @@ async def create_classify(
) -> Response: ) -> Response:
handler = classify(raw_request) handler = classify(raw_request)
if handler is None: if handler is None:
error_response = create_error_response( raise NotImplementedError("The model does not support Classification API")
message="The model does not support Classification API"
)
return JSONResponse(
content=error_response.model_dump(),
status_code=error_response.error.code,
)
return await handler(request, raw_request) return await handler(request, raw_request)
...@@ -4,14 +4,12 @@ ...@@ -4,14 +4,12 @@
from http import HTTPStatus from http import HTTPStatus
from fastapi import APIRouter, Depends, Request from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse
from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest
from vllm.entrypoints.pooling.embed.serving import ServingEmbedding from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
from vllm.entrypoints.utils import ( from vllm.entrypoints.utils import (
create_error_response,
load_aware_call, load_aware_call,
with_cancellation, with_cancellation,
) )
...@@ -39,11 +37,6 @@ async def create_embedding( ...@@ -39,11 +37,6 @@ async def create_embedding(
): ):
handler = embedding(raw_request) handler = embedding(raw_request)
if handler is None: if handler is None:
error_response = create_error_response( raise NotImplementedError("The model does not support Embeddings API")
message="The model does not support Embeddings API"
)
return JSONResponse(
content=error_response.model_dump(),
status_code=error_response.error.code,
)
return await handler(request, raw_request) return await handler(request, raw_request)
...@@ -37,10 +37,7 @@ def pooling(request: Request) -> OpenAIServingPooling | None: ...@@ -37,10 +37,7 @@ def pooling(request: Request) -> OpenAIServingPooling | None:
async def create_pooling(request: PoolingRequest, raw_request: Request): async def create_pooling(request: PoolingRequest, raw_request: Request):
handler = pooling(raw_request) handler = pooling(raw_request)
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization raise NotImplementedError("The model does not support Pooling API")
return base_server.create_error_response(
message="The model does not support Pooling API"
)
generator = await handler.create_pooling(request, raw_request) generator = await handler.create_pooling(request, raw_request)
......
...@@ -44,10 +44,7 @@ def rerank(request: Request) -> ServingScores | None: ...@@ -44,10 +44,7 @@ def rerank(request: Request) -> ServingScores | None:
async def create_score(request: ScoreRequest, raw_request: Request): async def create_score(request: ScoreRequest, raw_request: Request):
handler = score(raw_request) handler = score(raw_request)
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization raise NotImplementedError("The model does not support Score API")
return base_server.create_error_response(
message="The model does not support Score API"
)
generator = await handler.create_score(request, raw_request) generator = await handler.create_score(request, raw_request)
...@@ -93,10 +90,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): ...@@ -93,10 +90,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
async def do_rerank(request: RerankRequest, raw_request: Request): async def do_rerank(request: RerankRequest, raw_request: Request):
handler = rerank(raw_request) handler = rerank(raw_request)
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization raise NotImplementedError("The model does not support Rerank (Score) API")
return base_server.create_error_response(
message="The model does not support Rerank (Score) API"
)
generator = await handler.do_rerank(request, raw_request) generator = await handler.do_rerank(request, raw_request)
......
...@@ -61,9 +61,7 @@ router = APIRouter() ...@@ -61,9 +61,7 @@ router = APIRouter()
async def generate(request: GenerateRequest, raw_request: Request): async def generate(request: GenerateRequest, raw_request: Request):
handler = generate_tokens(raw_request) handler = generate_tokens(raw_request)
if handler is None: if handler is None:
return tokenization(raw_request).create_error_response( raise NotImplementedError("The model does not support generate tokens API")
message="The model does not support generate tokens API"
)
generator = await handler.serve_tokens(request, raw_request) generator = await handler.serve_tokens(request, raw_request)
......
...@@ -10,7 +10,6 @@ from vllm.entrypoints.openai.completion.protocol import CompletionRequest ...@@ -10,7 +10,6 @@ from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.render.serving import OpenAIServingRender from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.entrypoints.utils import create_error_response
from vllm.logger import init_logger from vllm.logger import init_logger
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -36,13 +35,8 @@ def render(request: Request) -> OpenAIServingRender | None: ...@@ -36,13 +35,8 @@ def render(request: Request) -> OpenAIServingRender | None:
async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request): async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
handler = render(raw_request) handler = render(raw_request)
if handler is None: if handler is None:
error = create_error_response( raise NotImplementedError(
message="The model does not support Chat Completions Render API", "The model does not support Chat Completions Render API"
err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND,
)
return JSONResponse(
status_code=HTTPStatus.NOT_FOUND, content=error.model_dump()
) )
result = await handler.render_chat_request(request) result = await handler.render_chat_request(request)
...@@ -66,14 +60,7 @@ async def render_chat_completion(request: ChatCompletionRequest, raw_request: Re ...@@ -66,14 +60,7 @@ async def render_chat_completion(request: ChatCompletionRequest, raw_request: Re
async def render_completion(request: CompletionRequest, raw_request: Request): async def render_completion(request: CompletionRequest, raw_request: Request):
handler = render(raw_request) handler = render(raw_request)
if handler is None: if handler is None:
error = create_error_response( raise NotImplementedError("The model does not support Completions Render API")
message="The model does not support Completions Render API",
err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND,
)
return JSONResponse(
status_code=HTTPStatus.NOT_FOUND, content=error.model_dump()
)
result = await handler.render_completion_request(request) result = await handler.render_completion_request(request)
......
...@@ -36,7 +36,31 @@ class VLLMValidationError(ValueError): ...@@ -36,7 +36,31 @@ class VLLMValidationError(ValueError):
return f"{base} ({', '.join(extras)})" if extras else base return f"{base} ({', '.join(extras)})" if extras else base
class VLLMNotFoundError(ValueError): class VLLMNotFoundError(Exception):
"""vLLM-specific NotFoundError""" """vLLM-specific NotFoundError"""
pass pass
class LoRAAdapterNotFoundError(VLLMNotFoundError):
    """Raised when no LoRA adapter can be found for the requested path.

    Attributes:
        lora_name: Name of the adapter that was requested.
        lora_path: Path at which no adapter was found.
        message: Human-readable description; this is what ``str()`` returns.
    """

    lora_name: str
    lora_path: str
    message: str

    def __init__(
        self,
        lora_name: str,
        lora_path: str,
    ) -> None:
        # Forward the constructor arguments unchanged so ``args`` mirrors the
        # signature: copy/pickle rebuild an exception by calling the class
        # with ``args``, which must therefore stay (lora_name, lora_path).
        super().__init__(lora_name, lora_path)
        # Keep the structured fields so handlers do not have to parse them
        # back out of the formatted message.
        self.lora_name = lora_name
        self.lora_path = lora_path
        # The exact wording is compared as a substring against other error
        # strings elsewhere in the project, so it must not change.
        self.message = (
            f"Loading lora {lora_name} failed: No adapter found for {lora_path}"
        )

    def __str__(self) -> str:
        return self.message
...@@ -7,6 +7,7 @@ from typing import Any, Literal ...@@ -7,6 +7,7 @@ from typing import Any, Literal
import torch import torch
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.exceptions import LoRAAdapterNotFoundError
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.lora_model import LoRAModel from vllm.lora.lora_model import LoRAModel
from vllm.lora.model_manager import ( from vllm.lora.model_manager import (
...@@ -147,12 +148,10 @@ class WorkerLoRAManager: ...@@ -147,12 +148,10 @@ class WorkerLoRAManager:
# offline mode) # offline mode)
# - No local adapter files found at `lora_request.lora_path` # - No local adapter files found at `lora_request.lora_path`
# For NotFoundError # For NotFoundError
raise ValueError( raise LoRAAdapterNotFoundError(
f"Loading lora {lora_request.lora_name} failed: No adapter " lora_request.lora_name, lora_request.lora_path
f"found for {lora_request.lora_path}"
) from e ) from e
except Exception as e: except Exception as e:
# For BadRequestError
raise e raise e
return lora return lora
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment