Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fe714dd5
Unverified
Commit
fe714dd5
authored
Mar 11, 2026
by
Ning Xie
Committed by
GitHub
Mar 10, 2026
Browse files
[openapi server] log exception in exception handler(2/N) (#36201)
Signed-off-by:
Andy Xie
<
andy.xning@gmail.com
>
parent
8ab3d742
Changes
16
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
63 additions
and
113 deletions
+63
-113
tests/entrypoints/openai/test_lora_adapters.py
tests/entrypoints/openai/test_lora_adapters.py
+3
-3
tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py
tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py
+1
-1
vllm/entrypoints/anthropic/api_router.py
vllm/entrypoints/anthropic/api_router.py
+2
-2
vllm/entrypoints/openai/chat_completion/api_router.py
vllm/entrypoints/openai/chat_completion/api_router.py
+1
-4
vllm/entrypoints/openai/completion/api_router.py
vllm/entrypoints/openai/completion/api_router.py
+1
-4
vllm/entrypoints/openai/models/serving.py
vllm/entrypoints/openai/models/serving.py
+11
-25
vllm/entrypoints/openai/responses/api_router.py
vllm/entrypoints/openai/responses/api_router.py
+3
-12
vllm/entrypoints/openai/speech_to_text/api_router.py
vllm/entrypoints/openai/speech_to_text/api_router.py
+2
-8
vllm/entrypoints/pooling/classify/api_router.py
vllm/entrypoints/pooling/classify/api_router.py
+2
-9
vllm/entrypoints/pooling/embed/api_router.py
vllm/entrypoints/pooling/embed/api_router.py
+2
-9
vllm/entrypoints/pooling/pooling/api_router.py
vllm/entrypoints/pooling/pooling/api_router.py
+1
-4
vllm/entrypoints/pooling/score/api_router.py
vllm/entrypoints/pooling/score/api_router.py
+2
-8
vllm/entrypoints/serve/disagg/api_router.py
vllm/entrypoints/serve/disagg/api_router.py
+1
-3
vllm/entrypoints/serve/render/api_router.py
vllm/entrypoints/serve/render/api_router.py
+3
-16
vllm/exceptions.py
vllm/exceptions.py
+25
-1
vllm/lora/worker_manager.py
vllm/lora/worker_manager.py
+3
-4
No files found.
tests/entrypoints/openai/test_lora_adapters.py
View file @
fe714dd5
...
@@ -196,7 +196,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
...
@@ -196,7 +196,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
invalid_files
.
mkdir
()
invalid_files
.
mkdir
()
(
invalid_files
/
"adapter_config.json"
).
write_text
(
"this is not json"
)
(
invalid_files
/
"adapter_config.json"
).
write_text
(
"this is not json"
)
with
pytest
.
raises
(
openai
.
BadRequest
Error
):
with
pytest
.
raises
(
openai
.
InternalServer
Error
):
await
client
.
post
(
await
client
.
post
(
"load_lora_adapter"
,
"load_lora_adapter"
,
cast_to
=
str
,
cast_to
=
str
,
...
@@ -232,7 +232,7 @@ async def test_dynamic_lora_badrequests(
...
@@ -232,7 +232,7 @@ async def test_dynamic_lora_badrequests(
json
.
dump
(
adapter_config
,
f
)
json
.
dump
(
adapter_config
,
f
)
# Test loading the adapter
# Test loading the adapter
with
pytest
.
raises
(
openai
.
BadRequest
Error
,
match
=
expected_error
):
with
pytest
.
raises
(
openai
.
InternalServer
Error
,
match
=
expected_error
):
await
client
.
post
(
await
client
.
post
(
"load_lora_adapter"
,
"load_lora_adapter"
,
cast_to
=
str
,
cast_to
=
str
,
...
@@ -312,7 +312,7 @@ async def test_loading_invalid_adapters_does_not_break_others(
...
@@ -312,7 +312,7 @@ async def test_loading_invalid_adapters_does_not_break_others(
body
=
{
"lora_name"
:
"notfound"
,
"lora_path"
:
"/not/an/adapter"
},
body
=
{
"lora_name"
:
"notfound"
,
"lora_path"
:
"/not/an/adapter"
},
)
)
for
_
in
range
(
25
):
for
_
in
range
(
25
):
with
suppress
(
openai
.
BadRequest
Error
):
with
suppress
(
openai
.
InternalServer
Error
):
await
client
.
post
(
await
client
.
post
(
"load_lora_adapter"
,
"load_lora_adapter"
,
cast_to
=
str
,
cast_to
=
str
,
...
...
tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py
View file @
fe714dd5
...
@@ -88,7 +88,7 @@ async def test_sagemaker_load_adapter_invalid_files(
...
@@ -88,7 +88,7 @@ async def test_sagemaker_load_adapter_invalid_files(
basic_server_with_lora
.
url_for
(
"adapters"
),
basic_server_with_lora
.
url_for
(
"adapters"
),
json
=
{
"name"
:
"invalid-adapter"
,
"src"
:
str
(
invalid_files
)},
json
=
{
"name"
:
"invalid-adapter"
,
"src"
:
str
(
invalid_files
)},
)
)
assert
load_response
.
status_code
==
4
00
assert
load_response
.
status_code
==
5
00
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
vllm/entrypoints/anthropic/api_router.py
View file @
fe714dd5
...
@@ -62,7 +62,7 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques
...
@@ -62,7 +62,7 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
error
=
base_server
.
create_error_response
(
error
=
base_server
.
create_error_response
(
message
=
"The model does not support Messages API"
NotImplementedError
(
"The model does not support Messages API"
)
)
)
return
translate_error_response
(
error
)
return
translate_error_response
(
error
)
...
@@ -108,7 +108,7 @@ async def count_tokens(request: AnthropicCountTokensRequest, raw_request: Reques
...
@@ -108,7 +108,7 @@ async def count_tokens(request: AnthropicCountTokensRequest, raw_request: Reques
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
error
=
base_server
.
create_error_response
(
error
=
base_server
.
create_error_response
(
message
=
"The model does not support Messages API"
NotImplementedError
(
"The model does not support Messages API"
)
)
)
return
translate_error_response
(
error
)
return
translate_error_response
(
error
)
...
...
vllm/entrypoints/openai/chat_completion/api_router.py
View file @
fe714dd5
...
@@ -50,10 +50,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
...
@@ -50,10 +50,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
)
)
handler
=
chat
(
raw_request
)
handler
=
chat
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
raise
NotImplementedError
(
"The model does not support Chat Completions API"
)
return
base_server
.
create_error_response
(
message
=
"The model does not support Chat Completions API"
)
generator
=
await
handler
.
create_chat_completion
(
request
,
raw_request
)
generator
=
await
handler
.
create_chat_completion
(
request
,
raw_request
)
...
...
vllm/entrypoints/openai/completion/api_router.py
View file @
fe714dd5
...
@@ -49,10 +49,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
...
@@ -49,10 +49,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
)
)
handler
=
completion
(
raw_request
)
handler
=
completion
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
raise
NotImplementedError
(
"The model does not support Completions API"
)
return
base_server
.
create_error_response
(
message
=
"The model does not support Completions API"
)
generator
=
await
handler
.
create_completion
(
request
,
raw_request
)
generator
=
await
handler
.
create_completion
(
request
,
raw_request
)
...
...
vllm/entrypoints/openai/models/serving.py
View file @
fe714dd5
...
@@ -7,7 +7,6 @@ from http import HTTPStatus
...
@@ -7,7 +7,6 @@ from http import HTTPStatus
from
vllm.engine.protocol
import
EngineClient
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.openai.engine.protocol
import
(
from
vllm.entrypoints.openai.engine.protocol
import
(
ErrorInfo
,
ErrorResponse
,
ErrorResponse
,
ModelCard
,
ModelCard
,
ModelList
,
ModelList
,
...
@@ -18,7 +17,8 @@ from vllm.entrypoints.serve.lora.protocol import (
...
@@ -18,7 +17,8 @@ from vllm.entrypoints.serve.lora.protocol import (
LoadLoRAAdapterRequest
,
LoadLoRAAdapterRequest
,
UnloadLoRAAdapterRequest
,
UnloadLoRAAdapterRequest
,
)
)
from
vllm.entrypoints.utils
import
sanitize_message
from
vllm.entrypoints.utils
import
create_error_response
from
vllm.exceptions
import
LoRAAdapterNotFoundError
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.resolver
import
LoRAResolver
,
LoRAResolverRegistry
from
vllm.lora.resolver
import
LoRAResolver
,
LoRAResolverRegistry
...
@@ -152,15 +152,15 @@ class OpenAIServingModels:
...
@@ -152,15 +152,15 @@ class OpenAIServingModels:
try
:
try
:
await
self
.
engine_client
.
add_lora
(
lora_request
)
await
self
.
engine_client
.
add_lora
(
lora_request
)
except
Exception
as
e
:
except
Exception
as
e
:
error_type
=
"BadRequestError"
if
str
(
status_code
=
HTTPStatus
.
BAD_REQUEST
LoRAAdapterNotFoundError
(
if
"No adapter found"
in
str
(
e
):
lora_request
.
lora_name
,
lora_request
.
lora_path
error_type
=
"NotFoundError"
)
status_code
=
HTTPStatus
.
NOT_FOUND
)
in
str
(
e
):
raise
LoRAAdapterNotFoundError
(
return
create_error_response
(
lora_request
.
lora_name
,
lora_request
.
lora_path
message
=
str
(
e
),
err_type
=
error_type
,
status_code
=
status_cod
e
)
from
e
)
raise
self
.
lora_requests
[
lora_name
]
=
lora_request
self
.
lora_requests
[
lora_name
]
=
lora_request
logger
.
info
(
logger
.
info
(
...
@@ -292,17 +292,3 @@ class OpenAIServingModels:
...
@@ -292,17 +292,3 @@ class OpenAIServingModels:
err_type
=
"NotFoundError"
,
err_type
=
"NotFoundError"
,
status_code
=
HTTPStatus
.
NOT_FOUND
,
status_code
=
HTTPStatus
.
NOT_FOUND
,
)
)
def
create_error_response
(
message
:
str
,
err_type
:
str
=
"BadRequestError"
,
status_code
:
HTTPStatus
=
HTTPStatus
.
BAD_REQUEST
,
)
->
ErrorResponse
:
return
ErrorResponse
(
error
=
ErrorInfo
(
message
=
sanitize_message
(
message
),
type
=
err_type
,
code
=
status_code
.
value
,
)
)
vllm/entrypoints/openai/responses/api_router.py
View file @
fe714dd5
...
@@ -59,10 +59,7 @@ async def _convert_stream_to_sse_events(
...
@@ -59,10 +59,7 @@ async def _convert_stream_to_sse_events(
async
def
create_responses
(
request
:
ResponsesRequest
,
raw_request
:
Request
):
async
def
create_responses
(
request
:
ResponsesRequest
,
raw_request
:
Request
):
handler
=
responses
(
raw_request
)
handler
=
responses
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
raise
NotImplementedError
(
"The model does not support Responses API"
)
return
base_server
.
create_error_response
(
message
=
"The model does not support Responses API"
)
generator
=
await
handler
.
create_responses
(
request
,
raw_request
)
generator
=
await
handler
.
create_responses
(
request
,
raw_request
)
...
@@ -88,10 +85,7 @@ async def retrieve_responses(
...
@@ -88,10 +85,7 @@ async def retrieve_responses(
):
):
handler
=
responses
(
raw_request
)
handler
=
responses
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
raise
NotImplementedError
(
"The model does not support Responses API"
)
return
base_server
.
create_error_response
(
message
=
"The model does not support Responses API"
)
response
=
await
handler
.
retrieve_responses
(
response
=
await
handler
.
retrieve_responses
(
response_id
,
response_id
,
...
@@ -115,10 +109,7 @@ async def retrieve_responses(
...
@@ -115,10 +109,7 @@ async def retrieve_responses(
async
def
cancel_responses
(
response_id
:
str
,
raw_request
:
Request
):
async
def
cancel_responses
(
response_id
:
str
,
raw_request
:
Request
):
handler
=
responses
(
raw_request
)
handler
=
responses
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
raise
NotImplementedError
(
"The model does not support Responses API"
)
return
base_server
.
create_error_response
(
message
=
"The model does not support Responses API"
)
response
=
await
handler
.
cancel_responses
(
response_id
)
response
=
await
handler
.
cancel_responses
(
response_id
)
...
...
vllm/entrypoints/openai/speech_to_text/api_router.py
View file @
fe714dd5
...
@@ -65,10 +65,7 @@ async def create_transcriptions(
...
@@ -65,10 +65,7 @@ async def create_transcriptions(
):
):
handler
=
transcription
(
raw_request
)
handler
=
transcription
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
raise
NotImplementedError
(
"The model does not support Transcriptions API"
)
return
base_server
.
create_error_response
(
message
=
"The model does not support Transcriptions API"
)
audio_data
=
await
request
.
file
.
read
()
audio_data
=
await
request
.
file
.
read
()
...
@@ -101,10 +98,7 @@ async def create_translations(
...
@@ -101,10 +98,7 @@ async def create_translations(
):
):
handler
=
translation
(
raw_request
)
handler
=
translation
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
raise
NotImplementedError
(
"The model does not support Translations API"
)
return
base_server
.
create_error_response
(
message
=
"The model does not support Translations API"
)
audio_data
=
await
request
.
file
.
read
()
audio_data
=
await
request
.
file
.
read
()
...
...
vllm/entrypoints/pooling/classify/api_router.py
View file @
fe714dd5
...
@@ -2,13 +2,12 @@
...
@@ -2,13 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
fastapi
import
APIRouter
,
Depends
,
Request
from
fastapi
import
APIRouter
,
Depends
,
Request
from
fastapi.responses
import
JSONResponse
,
Response
from
fastapi.responses
import
Response
from
vllm.entrypoints.openai.utils
import
validate_json_request
from
vllm.entrypoints.openai.utils
import
validate_json_request
from
vllm.entrypoints.pooling.classify.protocol
import
ClassificationRequest
from
vllm.entrypoints.pooling.classify.protocol
import
ClassificationRequest
from
vllm.entrypoints.pooling.classify.serving
import
ServingClassification
from
vllm.entrypoints.pooling.classify.serving
import
ServingClassification
from
vllm.entrypoints.utils
import
(
from
vllm.entrypoints.utils
import
(
create_error_response
,
load_aware_call
,
load_aware_call
,
with_cancellation
,
with_cancellation
,
)
)
...
@@ -28,12 +27,6 @@ async def create_classify(
...
@@ -28,12 +27,6 @@ async def create_classify(
)
->
Response
:
)
->
Response
:
handler
=
classify
(
raw_request
)
handler
=
classify
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
error_response
=
create_error_response
(
raise
NotImplementedError
(
"The model does not support Classification API"
)
message
=
"The model does not support Classification API"
)
return
JSONResponse
(
content
=
error_response
.
model_dump
(),
status_code
=
error_response
.
error
.
code
,
)
return
await
handler
(
request
,
raw_request
)
return
await
handler
(
request
,
raw_request
)
vllm/entrypoints/pooling/embed/api_router.py
View file @
fe714dd5
...
@@ -4,14 +4,12 @@
...
@@ -4,14 +4,12 @@
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
fastapi
import
APIRouter
,
Depends
,
Request
from
fastapi
import
APIRouter
,
Depends
,
Request
from
fastapi.responses
import
JSONResponse
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.utils
import
validate_json_request
from
vllm.entrypoints.openai.utils
import
validate_json_request
from
vllm.entrypoints.pooling.embed.protocol
import
EmbeddingRequest
from
vllm.entrypoints.pooling.embed.protocol
import
EmbeddingRequest
from
vllm.entrypoints.pooling.embed.serving
import
ServingEmbedding
from
vllm.entrypoints.pooling.embed.serving
import
ServingEmbedding
from
vllm.entrypoints.utils
import
(
from
vllm.entrypoints.utils
import
(
create_error_response
,
load_aware_call
,
load_aware_call
,
with_cancellation
,
with_cancellation
,
)
)
...
@@ -39,11 +37,6 @@ async def create_embedding(
...
@@ -39,11 +37,6 @@ async def create_embedding(
):
):
handler
=
embedding
(
raw_request
)
handler
=
embedding
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
error_response
=
create_error_response
(
raise
NotImplementedError
(
"The model does not support Embeddings API"
)
message
=
"The model does not support Embeddings API"
)
return
JSONResponse
(
content
=
error_response
.
model_dump
(),
status_code
=
error_response
.
error
.
code
,
)
return
await
handler
(
request
,
raw_request
)
return
await
handler
(
request
,
raw_request
)
vllm/entrypoints/pooling/pooling/api_router.py
View file @
fe714dd5
...
@@ -37,10 +37,7 @@ def pooling(request: Request) -> OpenAIServingPooling | None:
...
@@ -37,10 +37,7 @@ def pooling(request: Request) -> OpenAIServingPooling | None:
async
def
create_pooling
(
request
:
PoolingRequest
,
raw_request
:
Request
):
async
def
create_pooling
(
request
:
PoolingRequest
,
raw_request
:
Request
):
handler
=
pooling
(
raw_request
)
handler
=
pooling
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
raise
NotImplementedError
(
"The model does not support Pooling API"
)
return
base_server
.
create_error_response
(
message
=
"The model does not support Pooling API"
)
generator
=
await
handler
.
create_pooling
(
request
,
raw_request
)
generator
=
await
handler
.
create_pooling
(
request
,
raw_request
)
...
...
vllm/entrypoints/pooling/score/api_router.py
View file @
fe714dd5
...
@@ -44,10 +44,7 @@ def rerank(request: Request) -> ServingScores | None:
...
@@ -44,10 +44,7 @@ def rerank(request: Request) -> ServingScores | None:
async
def
create_score
(
request
:
ScoreRequest
,
raw_request
:
Request
):
async
def
create_score
(
request
:
ScoreRequest
,
raw_request
:
Request
):
handler
=
score
(
raw_request
)
handler
=
score
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
raise
NotImplementedError
(
"The model does not support Score API"
)
return
base_server
.
create_error_response
(
message
=
"The model does not support Score API"
)
generator
=
await
handler
.
create_score
(
request
,
raw_request
)
generator
=
await
handler
.
create_score
(
request
,
raw_request
)
...
@@ -93,10 +90,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
...
@@ -93,10 +90,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
async
def
do_rerank
(
request
:
RerankRequest
,
raw_request
:
Request
):
async
def
do_rerank
(
request
:
RerankRequest
,
raw_request
:
Request
):
handler
=
rerank
(
raw_request
)
handler
=
rerank
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
base_server
=
raw_request
.
app
.
state
.
openai_serving_tokenization
raise
NotImplementedError
(
"The model does not support Rerank (Score) API"
)
return
base_server
.
create_error_response
(
message
=
"The model does not support Rerank (Score) API"
)
generator
=
await
handler
.
do_rerank
(
request
,
raw_request
)
generator
=
await
handler
.
do_rerank
(
request
,
raw_request
)
...
...
vllm/entrypoints/serve/disagg/api_router.py
View file @
fe714dd5
...
@@ -61,9 +61,7 @@ router = APIRouter()
...
@@ -61,9 +61,7 @@ router = APIRouter()
async
def
generate
(
request
:
GenerateRequest
,
raw_request
:
Request
):
async
def
generate
(
request
:
GenerateRequest
,
raw_request
:
Request
):
handler
=
generate_tokens
(
raw_request
)
handler
=
generate_tokens
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
return
tokenization
(
raw_request
).
create_error_response
(
raise
NotImplementedError
(
"The model does not support generate tokens API"
)
message
=
"The model does not support generate tokens API"
)
generator
=
await
handler
.
serve_tokens
(
request
,
raw_request
)
generator
=
await
handler
.
serve_tokens
(
request
,
raw_request
)
...
...
vllm/entrypoints/serve/render/api_router.py
View file @
fe714dd5
...
@@ -10,7 +10,6 @@ from vllm.entrypoints.openai.completion.protocol import CompletionRequest
...
@@ -10,7 +10,6 @@ from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.utils
import
validate_json_request
from
vllm.entrypoints.openai.utils
import
validate_json_request
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.entrypoints.utils
import
create_error_response
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -36,13 +35,8 @@ def render(request: Request) -> OpenAIServingRender | None:
...
@@ -36,13 +35,8 @@ def render(request: Request) -> OpenAIServingRender | None:
async
def
render_chat_completion
(
request
:
ChatCompletionRequest
,
raw_request
:
Request
):
async
def
render_chat_completion
(
request
:
ChatCompletionRequest
,
raw_request
:
Request
):
handler
=
render
(
raw_request
)
handler
=
render
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
error
=
create_error_response
(
raise
NotImplementedError
(
message
=
"The model does not support Chat Completions Render API"
,
"The model does not support Chat Completions Render API"
err_type
=
"NotFoundError"
,
status_code
=
HTTPStatus
.
NOT_FOUND
,
)
return
JSONResponse
(
status_code
=
HTTPStatus
.
NOT_FOUND
,
content
=
error
.
model_dump
()
)
)
result
=
await
handler
.
render_chat_request
(
request
)
result
=
await
handler
.
render_chat_request
(
request
)
...
@@ -66,14 +60,7 @@ async def render_chat_completion(request: ChatCompletionRequest, raw_request: Re
...
@@ -66,14 +60,7 @@ async def render_chat_completion(request: ChatCompletionRequest, raw_request: Re
async
def
render_completion
(
request
:
CompletionRequest
,
raw_request
:
Request
):
async
def
render_completion
(
request
:
CompletionRequest
,
raw_request
:
Request
):
handler
=
render
(
raw_request
)
handler
=
render
(
raw_request
)
if
handler
is
None
:
if
handler
is
None
:
error
=
create_error_response
(
raise
NotImplementedError
(
"The model does not support Completions Render API"
)
message
=
"The model does not support Completions Render API"
,
err_type
=
"NotFoundError"
,
status_code
=
HTTPStatus
.
NOT_FOUND
,
)
return
JSONResponse
(
status_code
=
HTTPStatus
.
NOT_FOUND
,
content
=
error
.
model_dump
()
)
result
=
await
handler
.
render_completion_request
(
request
)
result
=
await
handler
.
render_completion_request
(
request
)
...
...
vllm/exceptions.py
View file @
fe714dd5
...
@@ -36,7 +36,31 @@ class VLLMValidationError(ValueError):
...
@@ -36,7 +36,31 @@ class VLLMValidationError(ValueError):
return
f
"
{
base
}
(
{
', '
.
join
(
extras
)
}
)"
if
extras
else
base
return
f
"
{
base
}
(
{
', '
.
join
(
extras
)
}
)"
if
extras
else
base
class
VLLMNotFoundError
(
ValueError
):
class
VLLMNotFoundError
(
Exception
):
"""vLLM-specific NotFoundError"""
"""vLLM-specific NotFoundError"""
pass
pass
class
LoRAAdapterNotFoundError
(
VLLMNotFoundError
):
"""Exception raised when a LoRA adapter is not found.
This exception is thrown when a requested LoRA adapter does not exist
in the system.
Attributes:
message: The error message string describing the exception
"""
message
:
str
def
__init__
(
self
,
lora_name
:
str
,
lora_path
:
str
,
)
->
None
:
message
=
f
"Loading lora
{
lora_name
}
failed: No adapter found for
{
lora_path
}
"
self
.
message
=
message
def
__str__
(
self
):
return
self
.
message
vllm/lora/worker_manager.py
View file @
fe714dd5
...
@@ -7,6 +7,7 @@ from typing import Any, Literal
...
@@ -7,6 +7,7 @@ from typing import Any, Literal
import
torch
import
torch
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.exceptions
import
LoRAAdapterNotFoundError
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.lora.lora_model
import
LoRAModel
from
vllm.lora.lora_model
import
LoRAModel
from
vllm.lora.model_manager
import
(
from
vllm.lora.model_manager
import
(
...
@@ -147,12 +148,10 @@ class WorkerLoRAManager:
...
@@ -147,12 +148,10 @@ class WorkerLoRAManager:
# offline mode)
# offline mode)
# - No local adapter files found at `lora_request.lora_path`
# - No local adapter files found at `lora_request.lora_path`
# For NotFoundError
# For NotFoundError
raise
ValueError
(
raise
LoRAAdapterNotFoundError
(
f
"Loading lora
{
lora_request
.
lora_name
}
failed: No adapter "
lora_request
.
lora_name
,
lora_request
.
lora_path
f
"found for
{
lora_request
.
lora_path
}
"
)
from
e
)
from
e
except
Exception
as
e
:
except
Exception
as
e
:
# For BadRequestError
raise
e
raise
e
return
lora
return
lora
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment