Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0c492b78
Unverified
Commit
0c492b78
authored
May 28, 2025
by
Cyrus Leung
Committed by
GitHub
May 28, 2025
Browse files
[Deprecation] Remove fallbacks for Embeddings API (#18795)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
0f0926b4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
12 additions
and
60 deletions
+12
-60
vllm/config.py
vllm/config.py
+6
-11
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+5
-34
vllm/outputs.py
vllm/outputs.py
+1
-15
No files found.
vllm/config.py
View file @
0c492b78
...
...
@@ -797,17 +797,12 @@ class ModelConfig:
else
:
# Aliases
if
task_option
==
"embedding"
:
preferred_task
=
self
.
_get_preferred_task
(
architectures
,
supported_tasks
)
if
preferred_task
!=
"embed"
:
msg
=
(
"The 'embedding' task will be restricted to "
"embedding models in a future release. Please "
"pass `--task classify`, `--task score`, or "
"`--task reward` explicitly for other pooling "
"models."
)
warnings
.
warn
(
msg
,
DeprecationWarning
,
stacklevel
=
2
)
task_option
=
preferred_task
or
"embed"
msg
=
(
"The 'embedding' task has been renamed to "
"'embed', please use the new name. The old name "
"will be removed in v1.0."
)
warnings
.
warn
(
msg
,
DeprecationWarning
,
stacklevel
=
2
)
task_option
=
"embed"
if
task_option
not
in
supported_tasks
:
msg
=
(
...
...
vllm/entrypoints/openai/api_server.py
View file @
0c492b78
...
...
@@ -17,7 +17,7 @@ from contextlib import asynccontextmanager
from
functools
import
partial
from
http
import
HTTPStatus
from
json
import
JSONDecodeError
from
typing
import
Annotated
,
Optional
,
Union
from
typing
import
Annotated
,
Optional
import
prometheus_client
import
regex
as
re
...
...
@@ -59,9 +59,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
EmbeddingChatRequest
,
EmbeddingCompletionRequest
,
EmbeddingRequest
,
EmbeddingResponse
,
EmbeddingResponseData
,
ErrorResponse
,
EmbeddingResponse
,
ErrorResponse
,
LoadLoRAAdapterRequest
,
PoolingChatRequest
,
PoolingCompletionRequest
,
...
...
@@ -627,37 +625,10 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
async
def
create_embedding
(
request
:
EmbeddingRequest
,
raw_request
:
Request
):
handler
=
embedding
(
raw_request
)
if
handler
is
None
:
fallback_handler
=
pooling
(
raw_request
)
if
fallback_handler
is
None
:
return
base
(
raw_request
).
create_error_response
(
message
=
"The model does not support Embeddings API"
)
return
base
(
raw_request
).
create_error_response
(
message
=
"The model does not support Embeddings API"
)
logger
.
warning
(
"Embeddings API will become exclusive to embedding models "
"in a future release. To return the hidden states directly, "
"use the Pooling API (`/pooling`) instead."
)
res
=
await
fallback_handler
.
create_pooling
(
request
,
raw_request
)
generator
:
Union
[
ErrorResponse
,
EmbeddingResponse
]
if
isinstance
(
res
,
PoolingResponse
):
generator
=
EmbeddingResponse
(
id
=
res
.
id
,
object
=
res
.
object
,
created
=
res
.
created
,
model
=
res
.
model
,
data
=
[
EmbeddingResponseData
(
index
=
d
.
index
,
embedding
=
d
.
data
,
# type: ignore
)
for
d
in
res
.
data
],
usage
=
res
.
usage
,
)
else
:
generator
=
res
else
:
generator
=
await
handler
.
create_embedding
(
request
,
raw_request
)
generator
=
await
handler
.
create_embedding
(
request
,
raw_request
)
if
isinstance
(
generator
,
ErrorResponse
):
return
JSONResponse
(
content
=
generator
.
model_dump
(),
...
...
vllm/outputs.py
View file @
0c492b78
...
...
@@ -7,7 +7,7 @@ from dataclasses import dataclass
from
typing
import
Any
,
Generic
,
Optional
,
Union
import
torch
from
typing_extensions
import
TypeVar
,
deprecated
from
typing_extensions
import
TypeVar
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
...
...
@@ -76,14 +76,6 @@ class PoolingOutput:
return
(
isinstance
(
other
,
self
.
__class__
)
and
bool
(
(
self
.
data
==
other
.
data
).
all
()))
@
property
@
deprecated
(
"`LLM.encode()` now stores raw outputs in the `data` "
"attribute. To return embeddings, use `LLM.embed()`. "
"To return class probabilities, use `LLM.classify()` "
"and access the `probs` attribute. "
)
def
embedding
(
self
)
->
list
[
float
]:
return
self
.
data
.
tolist
()
class
RequestOutput
:
"""The output data of a completion request to the LLM.
...
...
@@ -506,12 +498,6 @@ class ScoringOutput:
def
__repr__
(
self
)
->
str
:
return
f
"ScoringOutput(score=
{
self
.
score
}
)"
@
property
@
deprecated
(
"`LLM.score()` now returns scalar scores. "
"Please access it via the `score` attribute. "
)
def
embedding
(
self
)
->
list
[
float
]:
return
[
self
.
score
]
class
ScoringRequestOutput
(
PoolingRequestOutput
[
ScoringOutput
]):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment