Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
62de4f42
Unverified
Commit
62de4f42
authored
Dec 01, 2025
by
wang.yuqi
Committed by
GitHub
Dec 01, 2025
Browse files
[Frontend] Resettle pooling entrypoints (#29634)
Signed-off-by:
wang.yuqi
<
yuqi.wang@daocloud.io
>
parent
83805a60
Changes
39
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
1101 additions
and
20 deletions
+1101
-20
vllm/entrypoints/pooling/__init__.py
vllm/entrypoints/pooling/__init__.py
+16
-0
vllm/entrypoints/pooling/classify/__init__.py
vllm/entrypoints/pooling/classify/__init__.py
+0
-0
vllm/entrypoints/pooling/classify/api_router.py
vllm/entrypoints/pooling/classify/api_router.py
+50
-0
vllm/entrypoints/pooling/classify/protocol.py
vllm/entrypoints/pooling/classify/protocol.py
+181
-0
vllm/entrypoints/pooling/classify/serving.py
vllm/entrypoints/pooling/classify/serving.py
+7
-5
vllm/entrypoints/pooling/embed/__init__.py
vllm/entrypoints/pooling/embed/__init__.py
+0
-0
vllm/entrypoints/pooling/embed/api_router.py
vllm/entrypoints/pooling/embed/api_router.py
+67
-0
vllm/entrypoints/pooling/embed/protocol.py
vllm/entrypoints/pooling/embed/protocol.py
+208
-0
vllm/entrypoints/pooling/embed/serving.py
vllm/entrypoints/pooling/embed/serving.py
+8
-6
vllm/entrypoints/pooling/pooling/__init__.py
vllm/entrypoints/pooling/pooling/__init__.py
+0
-0
vllm/entrypoints/pooling/pooling/api_router.py
vllm/entrypoints/pooling/pooling/api_router.py
+63
-0
vllm/entrypoints/pooling/pooling/protocol.py
vllm/entrypoints/pooling/pooling/protocol.py
+148
-0
vllm/entrypoints/pooling/pooling/serving.py
vllm/entrypoints/pooling/pooling/serving.py
+5
-3
vllm/entrypoints/pooling/score/__init__.py
vllm/entrypoints/pooling/score/__init__.py
+0
-0
vllm/entrypoints/pooling/score/api_router.py
vllm/entrypoints/pooling/score/api_router.py
+149
-0
vllm/entrypoints/pooling/score/protocol.py
vllm/entrypoints/pooling/score/protocol.py
+145
-0
vllm/entrypoints/pooling/score/serving.py
vllm/entrypoints/pooling/score/serving.py
+5
-3
vllm/entrypoints/sagemaker/routes.py
vllm/entrypoints/sagemaker/routes.py
+48
-2
vllm/plugins/io_processors/interface.py
vllm/plugins/io_processors/interface.py
+1
-1
No files found.
vllm/entrypoints/pooling/__init__.py
0 → 100644
View file @
62de4f42
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
fastapi
import
FastAPI
def register_pooling_api_routers(app: FastAPI):
    """Attach the classify, embed, score and pooling routers to ``app``.

    The router modules are imported inside the function body, so they are
    only loaded when registration actually happens.
    """
    from vllm.entrypoints.pooling.classify.api_router import router as classify_router
    from vllm.entrypoints.pooling.embed.api_router import router as embed_router
    from vllm.entrypoints.pooling.pooling.api_router import router as pooling_router
    from vllm.entrypoints.pooling.score.api_router import router as score_router

    # Inclusion order: classify, embed, score, pooling.
    ordered_routers = (classify_router, embed_router, score_router, pooling_router)
    for api_router in ordered_routers:
        app.include_router(api_router)
vllm/entrypoints/pooling/classify/__init__.py
0 → 100644
View file @
62de4f42
vllm/entrypoints/pooling/classify/api_router.py
0 → 100644
View file @
62de4f42
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
http
import
HTTPStatus
from
fastapi
import
APIRouter
,
Depends
,
HTTPException
,
Request
from
starlette.responses
import
JSONResponse
from
typing_extensions
import
assert_never
from
vllm.entrypoints.openai.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.utils
import
validate_json_request
from
vllm.entrypoints.pooling.classify.protocol
import
(
ClassificationRequest
,
ClassificationResponse
,
)
from
vllm.entrypoints.pooling.classify.serving
import
ServingClassification
from
vllm.entrypoints.utils
import
load_aware_call
,
with_cancellation
# Router that collects the classification endpoint declared in this module.
router = APIRouter()
def classify(request: Request) -> ServingClassification | None:
    """Return the classification handler stored on the application state."""
    app_state = request.app.state
    return app_state.openai_serving_classification
@router.post("/classify", dependencies=[Depends(validate_json_request)])
@with_cancellation
@load_aware_call
async def create_classify(request: ClassificationRequest, raw_request: Request):
    # POST /classify: delegate to the classification handler and convert its
    # result into an HTTP response.
    serving = classify(raw_request)
    if serving is None:
        # No classification handler on the app state; the error object built
        # by the tokenization server is returned as-is.
        fallback_server = raw_request.app.state.openai_serving_tokenization
        return fallback_server.create_error_response(
            message="The model does not support Classification API"
        )

    try:
        result = await serving.create_classify(request, raw_request)
    except Exception as exc:
        # Any exception from the handler is reported as HTTP 500 with the
        # exception text as detail.
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        # Error payloads carry their own status code.
        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
    if isinstance(result, ClassificationResponse):
        return JSONResponse(content=result.model_dump())

    assert_never(result)
vllm/entrypoints/pooling/classify/protocol.py
0 → 100644
View file @
62de4f42
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
time
from
typing
import
Annotated
,
Any
,
TypeAlias
from
pydantic
import
(
Field
,
)
from
vllm
import
PoolingParams
from
vllm.config.pooler
import
get_use_activation
from
vllm.entrypoints.chat_utils
import
ChatCompletionMessageParam
from
vllm.entrypoints.openai.protocol
import
OpenAIBaseModel
,
UsageInfo
from
vllm.utils
import
random_uuid
class ClassificationCompletionRequest(OpenAIBaseModel):
    # Classification request whose payload is plain text: either one string
    # or a list of strings in ``input``.
    model: str | None = None
    input: list[str] | str
    # Field(ge=-1): values below -1 are rejected; None leaves it unset.
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
    user: str | None = None

    # --8<-- [start:classification-extra-params]
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."
        ),
    )
    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    softmax: bool | None = Field(
        default=None,
        description="softmax will be deprecated, please use use_activation instead.",
    )
    activation: bool | None = Field(
        default=None,
        description="activation will be deprecated, please use use_activation instead.",
    )
    use_activation: bool | None = Field(
        default=None,
        description="Whether to use activation for classification outputs. "
        "Default is True.",
    )
    # --8<-- [end:classification-extra-params]

    def to_pooling_params(self):
        """Build ``PoolingParams`` from this request.

        The activation flag is resolved by ``get_use_activation`` from the
        request itself.
        """
        truncation = self.truncate_prompt_tokens
        resolved_activation = get_use_activation(self)
        return PoolingParams(
            truncate_prompt_tokens=truncation,
            use_activation=resolved_activation,
        )
class ClassificationChatRequest(OpenAIBaseModel):
    # Classification request whose payload is a list of chat messages that
    # is rendered through a chat template.
    model: str | None = None
    messages: list[ChatCompletionMessageParam]
    # Field(ge=-1): values below -1 are rejected; None leaves it unset.
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
    user: str | None = None

    # --8<-- [start:chat-classification-extra-params]
    add_generation_prompt: bool = Field(
        default=False,
        description=(
            "If true, the generation prompt will be added to the chat template. "
            "This is a parameter used by chat template in tokenizer config of the "
            "model."
        ),
    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."
        ),
    )
    chat_template: str | None = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."
        ),
    )
    chat_template_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."
        ),
    )
    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    softmax: bool | None = Field(
        default=None,
        description="softmax will be deprecated, please use use_activation instead.",
    )
    activation: bool | None = Field(
        default=None,
        description="activation will be deprecated, please use use_activation instead.",
    )
    use_activation: bool | None = Field(
        default=None,
        description="Whether to use activation for classification outputs. "
        "Default is True.",
    )
    # --8<-- [end:chat-classification-extra-params]

    def to_pooling_params(self):
        """Build ``PoolingParams`` from this request.

        The activation flag is resolved by ``get_use_activation`` from the
        request itself.
        """
        truncation = self.truncate_prompt_tokens
        resolved_activation = get_use_activation(self)
        return PoolingParams(
            truncate_prompt_tokens=truncation,
            use_activation=resolved_activation,
        )
# A classification request is either the completion-style or the chat-style model.
ClassificationRequest: TypeAlias = (
    ClassificationCompletionRequest | ClassificationChatRequest
)
class ClassificationData(OpenAIBaseModel):
    # One classification result entry.
    index: int
    # Required field, but its value may be None.
    label: str | None
    probs: list[float]
    num_classes: int
class ClassificationResponse(OpenAIBaseModel):
    # Response envelope for the classification endpoint.
    # ``id`` defaults to "classify-" followed by a fresh random uuid.
    id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
    object: str = "list"
    # ``created`` defaults to the current time in whole seconds.
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[ClassificationData]
    usage: UsageInfo
vllm/entrypoints/
openai/serv
ing
_
classif
ication
.py
→
vllm/entrypoints/
pool
ing
/
classif
y/serving
.py
View file @
62de4f42
...
...
@@ -13,11 +13,6 @@ from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
ClassificationChatRequest
,
ClassificationCompletionRequest
,
ClassificationData
,
ClassificationRequest
,
ClassificationResponse
,
ErrorResponse
,
UsageInfo
,
)
...
...
@@ -27,6 +22,13 @@ from vllm.entrypoints.openai.serving_engine import (
ServeContext
,
)
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.pooling.classify.protocol
import
(
ClassificationChatRequest
,
ClassificationCompletionRequest
,
ClassificationData
,
ClassificationRequest
,
ClassificationResponse
,
)
from
vllm.entrypoints.renderer
import
RenderConfig
from
vllm.logger
import
init_logger
from
vllm.outputs
import
ClassificationOutput
,
PoolingRequestOutput
...
...
vllm/entrypoints/pooling/embed/__init__.py
0 → 100644
View file @
62de4f42
vllm/entrypoints/pooling/embed/api_router.py
0 → 100644
View file @
62de4f42
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
http
import
HTTPStatus
from
fastapi
import
APIRouter
,
Depends
,
HTTPException
,
Request
from
fastapi.responses
import
JSONResponse
,
StreamingResponse
from
typing_extensions
import
assert_never
from
vllm.entrypoints.openai.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.utils
import
validate_json_request
from
vllm.entrypoints.pooling.embed.protocol
import
(
EmbeddingBytesResponse
,
EmbeddingRequest
,
EmbeddingResponse
,
)
from
vllm.entrypoints.pooling.embed.serving
import
OpenAIServingEmbedding
from
vllm.entrypoints.utils
import
load_aware_call
,
with_cancellation
# Router that collects the embeddings endpoint declared in this module.
router = APIRouter()
def embedding(request: Request) -> OpenAIServingEmbedding | None:
    """Return the embedding handler stored on the application state."""
    app_state = request.app.state
    return app_state.openai_serving_embedding
@router.post(
    "/v1/embeddings",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_embedding(
    request: EmbeddingRequest,
    raw_request: Request,
):
    # POST /v1/embeddings: delegate to the embedding handler and convert its
    # result into an HTTP response.
    serving = embedding(raw_request)
    if serving is None:
        # No embedding handler on the app state; the error object built by
        # the tokenization server is returned as-is.
        fallback_server = raw_request.app.state.openai_serving_tokenization
        return fallback_server.create_error_response(
            message="The model does not support Embeddings API"
        )

    try:
        result = await serving.create_embedding(request, raw_request)
    except Exception as exc:
        # Any exception from the handler is reported as HTTP 500 with the
        # exception text as detail.
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        # Error payloads carry their own status code.
        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
    if isinstance(result, EmbeddingResponse):
        return JSONResponse(content=result.model_dump())
    if isinstance(result, EmbeddingBytesResponse):
        # Byte payloads are streamed; the metadata string goes in a header.
        return StreamingResponse(
            content=result.body,
            headers={"metadata": result.metadata},
            media_type=result.media_type,
        )

    assert_never(result)
vllm/entrypoints/pooling/embed/protocol.py
0 → 100644
View file @
62de4f42
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
time
from
typing
import
Annotated
,
Any
,
TypeAlias
from
pydantic
import
(
Field
,
model_validator
,
)
from
vllm
import
PoolingParams
from
vllm.entrypoints.chat_utils
import
ChatCompletionMessageParam
from
vllm.entrypoints.openai.protocol
import
OpenAIBaseModel
,
UsageInfo
from
vllm.utils
import
random_uuid
from
vllm.utils.serial_utils
import
EmbedDType
,
EncodingFormat
,
Endianness
class EmbeddingCompletionRequest(OpenAIBaseModel):
    # Embedding request whose payload is text or token ids in ``input``:
    # a string, a list of strings, a list of ints, or a list of int lists.

    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/embeddings
    model: str | None = None
    input: list[int] | list[list[int]] | str | list[str]
    encoding_format: EncodingFormat = "float"
    dimensions: int | None = None
    user: str | None = None
    # Field(ge=-1): values below -1 are rejected; None leaves it unset.
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None

    # --8<-- [start:embedding-extra-params]
    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."
        ),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    normalize: bool | None = Field(
        default=None,
        description="Whether to normalize the embeddings outputs. Default is True.",
    )
    embed_dtype: EmbedDType = Field(
        default="float32",
        description=(
            "What dtype to use for encoding. Default to using float32 for base64 "
            "encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    endianness: Endianness = Field(
        default="native",
        description=(
            "What endianness to use for encoding. Default to using native for "
            "base64 encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    # --8<-- [end:embedding-extra-params]

    def to_pooling_params(self):
        """Build ``PoolingParams`` from the truncation, dimensions and
        normalize settings of this request."""
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
            normalize=self.normalize,
        )
class EmbeddingChatRequest(OpenAIBaseModel):
    # Embedding request whose payload is a list of chat messages that is
    # rendered through a chat template.
    model: str | None = None
    messages: list[ChatCompletionMessageParam]

    encoding_format: EncodingFormat = "float"
    dimensions: int | None = None
    user: str | None = None
    # Field(ge=-1): values below -1 are rejected; None leaves it unset.
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None

    # --8<-- [start:chat-embedding-extra-params]
    add_generation_prompt: bool = Field(
        default=False,
        description=(
            "If true, the generation prompt will be added to the chat template. "
            "This is a parameter used by chat template in tokenizer config of the "
            "model."
        ),
    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."
        ),
    )
    chat_template: str | None = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."
        ),
    )
    chat_template_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."
        ),
    )
    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    normalize: bool | None = Field(
        default=None,
        description="Whether to normalize the embeddings outputs. Default is True.",
    )
    embed_dtype: EmbedDType = Field(
        default="float32",
        description=(
            "What dtype to use for encoding. Default to using float32 for base64 "
            "encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    endianness: Endianness = Field(
        default="native",
        description=(
            "What endianness to use for encoding. Default to using native for "
            "base64 encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    # --8<-- [end:chat-embedding-extra-params]

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        """Reject input that enables both ``continue_final_message`` and
        ``add_generation_prompt``.

        Raises:
            ValueError: if both keys are truthy in the raw input mapping.
        """
        # A "before" validator sees the raw input, which is not guaranteed to
        # be a mapping. Pass anything else through untouched so the regular
        # field validation reports it, instead of failing on ``.get``.
        if not isinstance(data, dict):
            return data
        if data.get("continue_final_message") and data.get("add_generation_prompt"):
            raise ValueError(
                "Cannot set both `continue_final_message` and "
                "`add_generation_prompt` to True."
            )
        return data

    def to_pooling_params(self):
        """Build ``PoolingParams`` from the truncation, dimensions and
        normalize settings of this request."""
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
            normalize=self.normalize,
        )
# An embedding request is either the completion-style or the chat-style model.
EmbeddingRequest: TypeAlias = (
    EmbeddingCompletionRequest | EmbeddingChatRequest
)
class EmbeddingResponseData(OpenAIBaseModel):
    # One embedding entry of a response.
    index: int
    object: str = "embedding"
    # Either a list of floats or a single string.
    embedding: list[float] | str
class EmbeddingResponse(OpenAIBaseModel):
    # JSON response envelope for the embeddings endpoint.
    # ``id`` defaults to "embd-" followed by a fresh random uuid.
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    # ``created`` defaults to the current time in whole seconds.
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[EmbeddingResponseData]
    usage: UsageInfo
class EmbeddingBytesResponse(OpenAIBaseModel):
    # Binary embedding result: byte chunks plus a metadata string.
    body: list[bytes]
    metadata: str
    media_type: str = "application/octet-stream"
vllm/entrypoints/
openai/serv
ing
_
embed
d
ing.py
→
vllm/entrypoints/
pool
ing
/
embed
/serv
ing.py
View file @
62de4f42
...
...
@@ -13,12 +13,6 @@ from vllm.engine.protocol import EngineClient
from
vllm.entrypoints.chat_utils
import
ChatTemplateContentFormatOption
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.protocol
import
(
EmbeddingBytesResponse
,
EmbeddingChatRequest
,
EmbeddingCompletionRequest
,
EmbeddingRequest
,
EmbeddingResponse
,
EmbeddingResponseData
,
ErrorResponse
,
UsageInfo
,
)
...
...
@@ -29,6 +23,14 @@ from vllm.entrypoints.openai.serving_engine import (
TextTokensPrompt
,
)
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.pooling.embed.protocol
import
(
EmbeddingBytesResponse
,
EmbeddingChatRequest
,
EmbeddingCompletionRequest
,
EmbeddingRequest
,
EmbeddingResponse
,
EmbeddingResponseData
,
)
from
vllm.entrypoints.renderer
import
RenderConfig
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.logger
import
init_logger
...
...
vllm/entrypoints/pooling/pooling/__init__.py
0 → 100644
View file @
62de4f42
vllm/entrypoints/pooling/pooling/api_router.py
0 → 100644
View file @
62de4f42
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
http
import
HTTPStatus
from
fastapi
import
APIRouter
,
Depends
,
HTTPException
,
Request
from
fastapi.responses
import
JSONResponse
,
StreamingResponse
from
typing_extensions
import
assert_never
from
vllm.entrypoints.openai.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.utils
import
validate_json_request
from
vllm.entrypoints.pooling.pooling.protocol
import
(
IOProcessorResponse
,
PoolingBytesResponse
,
PoolingRequest
,
PoolingResponse
,
)
from
vllm.entrypoints.pooling.pooling.serving
import
OpenAIServingPooling
from
vllm.entrypoints.utils
import
load_aware_call
,
with_cancellation
# Router that collects the pooling endpoint declared in this module.
router = APIRouter()
def pooling(request: Request) -> OpenAIServingPooling | None:
    """Return the pooling handler stored on the application state."""
    app_state = request.app.state
    return app_state.openai_serving_pooling
@router.post(
    "/pooling",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_pooling(request: PoolingRequest, raw_request: Request):
    # POST /pooling: delegate to the pooling handler and convert its result
    # into an HTTP response.
    serving = pooling(raw_request)
    if serving is None:
        # No pooling handler on the app state; the error object built by the
        # tokenization server is returned as-is.
        fallback_server = raw_request.app.state.openai_serving_tokenization
        return fallback_server.create_error_response(
            message="The model does not support Pooling API"
        )

    try:
        result = await serving.create_pooling(request, raw_request)
    except Exception as exc:
        # Any exception from the handler is reported as HTTP 500 with the
        # exception text as detail.
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        # Error payloads carry their own status code.
        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
    if isinstance(result, (PoolingResponse, IOProcessorResponse)):
        return JSONResponse(content=result.model_dump())
    if isinstance(result, PoolingBytesResponse):
        # Byte payloads are streamed; the metadata string goes in a header.
        return StreamingResponse(
            content=result.body,
            headers={"metadata": result.metadata},
            media_type=result.media_type,
        )

    assert_never(result)
vllm/entrypoints/pooling/pooling/protocol.py
0 → 100644
View file @
62de4f42
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
time
from
typing
import
Generic
,
TypeAlias
,
TypeVar
from
pydantic
import
(
Field
,
)
from
vllm
import
PoolingParams
from
vllm.config.pooler
import
get_use_activation
from
vllm.entrypoints.openai.protocol
import
OpenAIBaseModel
,
UsageInfo
from
vllm.entrypoints.pooling.embed.protocol
import
(
EmbeddingChatRequest
,
EmbeddingCompletionRequest
,
)
from
vllm.tasks
import
PoolingTask
from
vllm.utils
import
random_uuid
from
vllm.utils.serial_utils
import
EmbedDType
,
EncodingFormat
,
Endianness
class PoolingCompletionRequest(EmbeddingCompletionRequest):
    # Completion-style pooling request: the embedding completion fields plus
    # a task selector and the activation switches.
    task: PoolingTask | None = None
    softmax: bool | None = Field(
        default=None,
        description="softmax will be deprecated, please use use_activation instead.",
    )
    activation: bool | None = Field(
        default=None,
        description="activation will be deprecated, please use use_activation instead.",
    )
    use_activation: bool | None = Field(
        default=None,
        description="Whether to use activation for classification outputs. "
        "If it is a classify or token_classify task, the default is True; "
        "for other tasks, this value should be None.",
    )

    def to_pooling_params(self):
        """Build ``PoolingParams`` from this request.

        Adds the activation flag resolved by ``get_use_activation`` to the
        truncation, dimensions and normalize settings.
        """
        truncation = self.truncate_prompt_tokens
        output_dimensions = self.dimensions
        normalize_flag = self.normalize
        resolved_activation = get_use_activation(self)
        return PoolingParams(
            truncate_prompt_tokens=truncation,
            dimensions=output_dimensions,
            normalize=normalize_flag,
            use_activation=resolved_activation,
        )
class PoolingChatRequest(EmbeddingChatRequest):
    # Chat-style pooling request: the embedding chat fields plus a task
    # selector and the activation switches.
    task: PoolingTask | None = None
    softmax: bool | None = Field(
        default=None,
        description="softmax will be deprecated, please use use_activation instead.",
    )
    activation: bool | None = Field(
        default=None,
        description="activation will be deprecated, please use use_activation instead.",
    )
    use_activation: bool | None = Field(
        default=None,
        description="Whether to use activation for classification outputs. "
        "If it is a classify or token_classify task, the default is True; "
        "for other tasks, this value should be None.",
    )

    def to_pooling_params(self):
        """Build ``PoolingParams`` from this request.

        Adds the activation flag resolved by ``get_use_activation`` to the
        truncation, dimensions and normalize settings.
        """
        truncation = self.truncate_prompt_tokens
        output_dimensions = self.dimensions
        normalize_flag = self.normalize
        resolved_activation = get_use_activation(self)
        return PoolingParams(
            truncate_prompt_tokens=truncation,
            dimensions=output_dimensions,
            normalize=normalize_flag,
            use_activation=resolved_activation,
        )
# Type parameter for the plugin-defined ``data`` payload of the IO processor models.
T = TypeVar("T")
class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
    # Pooling request whose ``data`` payload has a generic type ``T``; the
    # task defaults to "plugin".
    model: str | None = None

    priority: int = Field(default=0)
    """
    The priority of the request (lower means earlier handling;
    default: 0). Any priority other than 0 will raise an error
    if the served model does not use priority scheduling.
    """
    data: T

    task: PoolingTask = "plugin"
    encoding_format: EncodingFormat = "float"
    embed_dtype: EmbedDType = Field(
        default="float32",
        description=(
            "What dtype to use for encoding. Default to using float32 for base64 "
            "encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    endianness: Endianness = Field(
        default="native",
        description=(
            "What endianness to use for encoding. Default to using native for "
            "base64 encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )

    def to_pooling_params(self):
        """Return default ``PoolingParams``; no request field is forwarded."""
        return PoolingParams()
class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
    # Response counterpart of the IO processor request; ``data`` is generic.
    request_id: str | None = None
    """
    The request_id associated with this response
    """
    # ``created_at`` defaults to the current time in whole seconds.
    created_at: int = Field(default_factory=lambda: int(time.time()))

    data: T
    """
    When using plugins IOProcessor plugins, the actual output is generated
    by the plugin itself. Hence, we use a generic type for the response data
    """
# A pooling request is the completion-style, chat-style or IO processor model.
PoolingRequest: TypeAlias = (
    PoolingCompletionRequest | PoolingChatRequest | IOProcessorRequest
)
class PoolingResponseData(OpenAIBaseModel):
    # One pooling entry of a response.
    index: int
    object: str = "pooling"
    # A list of float lists, a flat list of floats, or a single string.
    data: list[list[float]] | list[float] | str
class PoolingResponse(OpenAIBaseModel):
    # JSON response envelope for the pooling endpoint.
    # ``id`` defaults to "pool-" followed by a fresh random uuid.
    id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
    object: str = "list"
    # ``created`` defaults to the current time in whole seconds.
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[PoolingResponseData]
    usage: UsageInfo
class PoolingBytesResponse(OpenAIBaseModel):
    # Binary pooling result: byte chunks plus a metadata string.
    body: list[bytes]
    metadata: str
    media_type: str = "application/octet-stream"
vllm/entrypoints/
openai/serving_pool
ing.py
→
vllm/entrypoints/
pooling/pooling/serv
ing.py
View file @
62de4f42
...
...
@@ -16,6 +16,11 @@ from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.protocol
import
(
ErrorResponse
,
UsageInfo
,
)
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.pooling.pooling.protocol
import
(
IOProcessorRequest
,
IOProcessorResponse
,
PoolingBytesResponse
,
...
...
@@ -24,10 +29,7 @@ from vllm.entrypoints.openai.protocol import (
PoolingRequest
,
PoolingResponse
,
PoolingResponseData
,
UsageInfo
,
)
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.renderer
import
RenderConfig
from
vllm.entrypoints.utils
import
_validate_truncation_size
from
vllm.logger
import
init_logger
...
...
vllm/entrypoints/pooling/score/__init__.py
0 → 100644
View file @
62de4f42
vllm/entrypoints/pooling/score/api_router.py
0 → 100644
View file @
62de4f42
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
http
import
HTTPStatus
from
fastapi
import
APIRouter
,
Depends
,
HTTPException
,
Request
from
fastapi.responses
import
JSONResponse
from
typing_extensions
import
assert_never
from
vllm.entrypoints.openai.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.utils
import
validate_json_request
from
vllm.entrypoints.pooling.score.protocol
import
(
RerankRequest
,
RerankResponse
,
ScoreRequest
,
ScoreResponse
,
)
from
vllm.entrypoints.pooling.score.serving
import
ServingScores
from
vllm.entrypoints.utils
import
load_aware_call
,
with_cancellation
from
vllm.logger
import
init_logger
# Router that collects the score and rerank endpoints declared in this module.
router = APIRouter()

# Module-level logger used for the endpoint relocation notices.
logger = init_logger(__name__)
def score(request: Request) -> ServingScores | None:
    """Return the scores handler stored on the application state."""
    app_state = request.app.state
    return app_state.openai_serving_scores
def rerank(request: Request) -> ServingScores | None:
    """Return the handler used for rerank requests.

    This reads the same ``openai_serving_scores`` attribute as ``score``.
    """
    app_state = request.app.state
    return app_state.openai_serving_scores
@router.post(
    "/score",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_score(request: ScoreRequest, raw_request: Request):
    # POST /score: delegate to the scores handler and convert its result
    # into an HTTP response.
    serving = score(raw_request)
    if serving is None:
        # No scores handler on the app state; the error object built by the
        # tokenization server is returned as-is.
        fallback_server = raw_request.app.state.openai_serving_tokenization
        return fallback_server.create_error_response(
            message="The model does not support Score API"
        )

    try:
        result = await serving.create_score(request, raw_request)
    except Exception as exc:
        # Any exception from the handler is reported as HTTP 500 with the
        # exception text as detail.
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        # Error payloads carry their own status code.
        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
    if isinstance(result, ScoreResponse):
        return JSONResponse(content=result.model_dump())

    assert_never(result)
@router.post(
    "/v1/score",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_score_v1(request: ScoreRequest, raw_request: Request):
    # POST /v1/score: compatibility route that forwards to ``create_score``.
    # The relocation notice is emitted with ``warning_once`` (as the
    # /v1/rerank route does) so that it is not repeated for every request.
    logger.warning_once(
        "To indicate that Score API is not part of standard OpenAI API, we "
        "have moved it to `/score`. Please update your client accordingly."
    )

    return await create_score(request, raw_request)
@router.post(
    "/rerank",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def do_rerank(request: RerankRequest, raw_request: Request):
    # POST /rerank: delegate to the scores handler's rerank entry point and
    # convert its result into an HTTP response.
    serving = rerank(raw_request)
    if serving is None:
        # No scores handler on the app state; the error object built by the
        # tokenization server is returned as-is.
        fallback_server = raw_request.app.state.openai_serving_tokenization
        return fallback_server.create_error_response(
            message="The model does not support Rerank (Score) API"
        )

    try:
        result = await serving.do_rerank(request, raw_request)
    except Exception as exc:
        # Any exception from the handler is reported as HTTP 500 with the
        # exception text as detail.
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        # Error payloads carry their own status code.
        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
    if isinstance(result, RerankResponse):
        return JSONResponse(content=result.model_dump())

    assert_never(result)
@router.post(
    "/v1/rerank",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
async def do_rerank_v1(request: RerankRequest, raw_request: Request):
    # POST /v1/rerank: logs a relocation notice via `warning_once`, then
    # delegates to `do_rerank` and returns its result.
    relocation_notice = (
        "To indicate that the rerank API is not part of the standard OpenAI"
        " API, we have located it at `/rerank`. Please update your client "
        "accordingly. (Note: Conforms to JinaAI rerank API)"
    )
    logger.warning_once(relocation_notice)

    response = await do_rerank(request, raw_request)
    return response
@router.post(
    "/v2/rerank",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
async def do_rerank_v2(request: RerankRequest, raw_request: Request):
    # POST /v2/rerank: pure alias, delegates straight to `do_rerank`.
    response = await do_rerank(request, raw_request)
    return response
vllm/entrypoints/pooling/score/protocol.py
0 → 100644
View file @
62de4f42
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
time
from
typing
import
Annotated
,
Any
from
pydantic
import
(
BaseModel
,
Field
,
)
from
vllm
import
PoolingParams
from
vllm.config.pooler
import
get_use_activation
from
vllm.entrypoints.openai.protocol
import
OpenAIBaseModel
,
UsageInfo
from
vllm.entrypoints.score_utils
import
ScoreContentPartParam
,
ScoreMultiModalParam
from
vllm.utils
import
random_uuid
class ScoreRequest(OpenAIBaseModel):
    # Request body for the score endpoints: two text (or multi-modal) inputs
    # plus optional pooling-related settings.
    model: str | None = None
    text_1: list[str] | str | ScoreMultiModalParam
    text_2: list[str] | str | ScoreMultiModalParam
    # Optional; when given, must be an integer >= -1.
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None

    # --8<-- [start:score-extra-params]

    mm_processor_kwargs: dict[str, Any] | None = Field(
        None,
        description="Additional kwargs to pass to the HF processor.",
    )

    priority: int = Field(
        0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )

    softmax: bool | None = Field(
        None,
        description="softmax will be deprecated, please use use_activation instead.",
    )

    activation: bool | None = Field(
        None,
        description="activation will be deprecated, please use use_activation instead.",
    )

    use_activation: bool | None = Field(
        None,
        description=(
            "Whether to use activation for classification outputs. "
            "Default is True."
        ),
    )

    # --8<-- [end:score-extra-params]

    def to_pooling_params(self):
        """Build the ``PoolingParams`` for this request.

        ``truncate_prompt_tokens`` is passed through as-is and
        ``use_activation`` is taken from ``get_use_activation(self)``.
        """
        activation_flag = get_use_activation(self)
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=activation_flag,
        )
class RerankRequest(OpenAIBaseModel):
    # Request body for the rerank endpoints: one query, the documents to
    # rank against it, plus optional pooling-related settings.
    model: str | None = None
    query: str | ScoreMultiModalParam
    documents: list[str] | ScoreMultiModalParam
    # Defaults to 0 when the client omits it.
    top_n: int = Field(default_factory=lambda: 0)
    # Optional; when given, must be an integer >= -1.
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None

    # --8<-- [start:rerank-extra-params]

    mm_processor_kwargs: dict[str, Any] | None = Field(
        None,
        description="Additional kwargs to pass to the HF processor.",
    )

    priority: int = Field(
        0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )

    softmax: bool | None = Field(
        None,
        description="softmax will be deprecated, please use use_activation instead.",
    )

    activation: bool | None = Field(
        None,
        description="activation will be deprecated, please use use_activation instead.",
    )

    use_activation: bool | None = Field(
        None,
        description=(
            "Whether to use activation for classification outputs. "
            "Default is True."
        ),
    )

    # --8<-- [end:rerank-extra-params]

    def to_pooling_params(self):
        """Build the ``PoolingParams`` for this request.

        ``truncate_prompt_tokens`` is passed through as-is and
        ``use_activation`` is taken from ``get_use_activation(self)``.
        """
        activation_flag = get_use_activation(self)
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            use_activation=activation_flag,
        )
class RerankDocument(BaseModel):
    # A document echoed back in a rerank result. Both fields are optional
    # and default to None.
    text: str | None = None
    multi_modal: ScoreContentPartParam | None = None
class RerankResult(BaseModel):
    # One entry of a rerank response: an index, the document it refers to,
    # and that document's relevance score. All three fields are required.
    index: int
    document: RerankDocument
    relevance_score: float
class RerankUsage(BaseModel):
    # Usage block of a rerank response; carries a single required counter.
    total_tokens: int
class RerankResponse(OpenAIBaseModel):
    # Response body for the rerank endpoints. Every field is required;
    # none has a default.
    id: str
    model: str
    usage: RerankUsage
    results: list[RerankResult]
class ScoreResponseData(OpenAIBaseModel):
    # One scored item inside a ScoreResponse.
    index: int
    # Type tag for this item; defaults to "score".
    object: str = "score"
    score: float
class ScoreResponse(OpenAIBaseModel):
    # Response body for the score endpoints.

    # Generated per instance: "embd-" followed by a fresh random_uuid().
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    # Creation time as whole seconds from time.time(), taken at construction.
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[ScoreResponseData]
    usage: UsageInfo
vllm/entrypoints/
openai
/serving
_score
.py
→
vllm/entrypoints/
pooling/score
/serving.py
View file @
62de4f42
...
...
@@ -11,6 +11,11 @@ from vllm.engine.protocol import EngineClient
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.protocol
import
(
ErrorResponse
,
UsageInfo
,
)
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.pooling.score.protocol
import
(
RerankDocument
,
RerankRequest
,
RerankResponse
,
...
...
@@ -19,10 +24,7 @@ from vllm.entrypoints.openai.protocol import (
ScoreRequest
,
ScoreResponse
,
ScoreResponseData
,
UsageInfo
,
)
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.score_utils
import
(
ScoreContentPartParam
,
ScoreMultiModalParam
,
...
...
vllm/entrypoints/sagemaker/routes.py
View file @
62de4f42
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
from
collections.abc
import
Awaitable
,
Callable
from
http
import
HTTPStatus
from
typing
import
Any
import
model_hosting_container_standards.sagemaker
as
sagemaker_standards
import
pydantic
...
...
@@ -9,12 +11,56 @@ from fastapi import APIRouter, Depends, HTTPException, Request
from
fastapi.responses
import
JSONResponse
,
Response
from
vllm.entrypoints.openai.api_server
import
(
INVOCATION_VALIDATORS
,
base
,
chat
,
completion
,
create_chat_completion
,
create_completion
,
health
,
validate_json_request
,
)
from
vllm.entrypoints.openai.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
CompletionRequest
,
ErrorResponse
,
)
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.pooling.classify.api_router
import
classify
,
create_classify
from
vllm.entrypoints.pooling.classify.protocol
import
ClassificationRequest
from
vllm.entrypoints.pooling.embed.api_router
import
create_embedding
,
embedding
from
vllm.entrypoints.pooling.embed.protocol
import
EmbeddingRequest
from
vllm.entrypoints.pooling.pooling.api_router
import
create_pooling
,
pooling
from
vllm.entrypoints.pooling.pooling.protocol
import
PoolingRequest
from
vllm.entrypoints.pooling.score.api_router
import
(
create_score
,
do_rerank
,
rerank
,
score
,
)
from
vllm.entrypoints.pooling.score.protocol
import
RerankRequest
,
ScoreRequest
# TODO: RequestType = TypeForm[BaseModel] when recognized by type checkers
# (requires typing_extensions >= 4.13)
RequestType = Any

# Given the incoming Request, returns the serving object (or None).
GetHandlerFn = Callable[[Request], OpenAIServing | None]
# Awaitable endpoint taking the parsed request object and the raw Request.
EndpointFn = Callable[[RequestType, Request], Awaitable[Any]]

# Each entry pairs a request type with its (handler getter, endpoint) tuple.
# NOTE: Items defined earlier take higher priority
INVOCATION_TYPES: list[tuple[RequestType, tuple[GetHandlerFn, EndpointFn]]] = [
    (ChatCompletionRequest, (chat, create_chat_completion)),
    (CompletionRequest, (completion, create_completion)),
    (EmbeddingRequest, (embedding, create_embedding)),
    (ClassificationRequest, (classify, create_classify)),
    (ScoreRequest, (score, create_score)),
    (RerankRequest, (rerank, do_rerank)),
    (PoolingRequest, (pooling, create_pooling)),
]

# Same order as INVOCATION_TYPES, with each request type wrapped in a
# pydantic TypeAdapter.
# NOTE: Construct the TypeAdapters only once
INVOCATION_VALIDATORS = [
    (pydantic.TypeAdapter(request_type), handler_pair)
    for request_type, handler_pair in INVOCATION_TYPES
]
def
register_sagemaker_routes
(
router
:
APIRouter
):
...
...
vllm/plugins/io_processors/interface.py
View file @
62de4f42
...
...
@@ -6,7 +6,7 @@ from collections.abc import AsyncGenerator, Sequence
from
typing
import
Any
,
Generic
,
TypeVar
from
vllm.config
import
VllmConfig
from
vllm.entrypoints.
openai
.protocol
import
IOProcessorResponse
from
vllm.entrypoints.
pooling.pooling
.protocol
import
IOProcessorResponse
from
vllm.inputs.data
import
PromptType
from
vllm.outputs
import
PoolingRequestOutput
from
vllm.pooling_params
import
PoolingParams
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment