Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d9e00dbd
Unverified
Commit
d9e00dbd
authored
Aug 29, 2025
by
wang.yuqi
Committed by
GitHub
Aug 29, 2025
Browse files
[Performance] V1 Classify Models E2E Performance Optimization (#23541)
Signed-off-by:
wang.yuqi
<
noooop@126.com
>
parent
ad39106b
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
80 additions
and
37 deletions
+80
-37
tests/entrypoints/llm/test_classify.py
tests/entrypoints/llm/test_classify.py
+6
-0
tests/entrypoints/openai/test_classification.py
tests/entrypoints/openai/test_classification.py
+30
-0
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+1
-5
vllm/model_executor/layers/pooler.py
vllm/model_executor/layers/pooler.py
+32
-28
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+11
-4
No files found.
tests/entrypoints/llm/test_classify.py
View file @
d9e00dbd
...
@@ -62,3 +62,9 @@ def test_encode_api(llm: LLM):
...
@@ -62,3 +62,9 @@ def test_encode_api(llm: LLM):
err_msg
=
"pooling_task must be one of.+"
err_msg
=
"pooling_task must be one of.+"
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
llm
.
encode
(
prompts
,
use_tqdm
=
False
)
llm
.
encode
(
prompts
,
use_tqdm
=
False
)
def
test_score_api
(
llm
:
LLM
):
err_msg
=
"Score API is only enabled for num_labels == 1."
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
llm
.
score
(
"ping"
,
"pong"
,
use_tqdm
=
False
)
tests/entrypoints/openai/test_classification.py
View file @
d9e00dbd
...
@@ -226,3 +226,33 @@ def test_pooling(server: RemoteOpenAIServer, model_name: str):
...
@@ -226,3 +226,33 @@ def test_pooling(server: RemoteOpenAIServer, model_name: str):
},
},
)
)
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
def
test_score
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
# score api is only enabled for num_labels == 1.
response
=
requests
.
post
(
server
.
url_for
(
"score"
),
json
=
{
"model"
:
model_name
,
"text_1"
:
"ping"
,
"text_2"
:
"pong"
,
},
)
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
def
test_rerank
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
# rerank api is only enabled for num_labels == 1.
response
=
requests
.
post
(
server
.
url_for
(
"rerank"
),
json
=
{
"model"
:
model_name
,
"query"
:
"ping"
,
"documents"
:
[
"pong"
],
},
)
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
vllm/entrypoints/openai/api_server.py
View file @
d9e00dbd
...
@@ -1805,17 +1805,13 @@ async def init_app_state(
...
@@ -1805,17 +1805,13 @@ async def init_app_state(
request_logger
=
request_logger
,
request_logger
=
request_logger
,
log_error_stack
=
args
.
log_error_stack
,
log_error_stack
=
args
.
log_error_stack
,
)
if
"classify"
in
supported_tasks
else
None
)
if
"classify"
in
supported_tasks
else
None
enable_serving_reranking
=
(
"classify"
in
supported_tasks
and
getattr
(
model_config
.
hf_config
,
"num_labels"
,
0
)
==
1
)
state
.
openai_serving_scores
=
ServingScores
(
state
.
openai_serving_scores
=
ServingScores
(
engine_client
,
engine_client
,
model_config
,
model_config
,
state
.
openai_serving_models
,
state
.
openai_serving_models
,
request_logger
=
request_logger
,
request_logger
=
request_logger
,
log_error_stack
=
args
.
log_error_stack
,
log_error_stack
=
args
.
log_error_stack
,
)
if
(
"embed"
in
supported_tasks
or
enable_serving_reranking
)
else
None
)
if
(
"embed"
in
supported_tasks
or
"score"
in
supported_tasks
)
else
None
state
.
openai_serving_tokenization
=
OpenAIServingTokenization
(
state
.
openai_serving_tokenization
=
OpenAIServingTokenization
(
engine_client
,
engine_client
,
model_config
,
model_config
,
...
...
vllm/model_executor/layers/pooler.py
View file @
d9e00dbd
...
@@ -13,12 +13,15 @@ import torch.nn.functional as F
...
@@ -13,12 +13,15 @@ import torch.nn.functional as F
from
transformers
import
PretrainedConfig
from
transformers
import
PretrainedConfig
from
vllm.config
import
ModelConfig
,
PoolerConfig
from
vllm.config
import
ModelConfig
,
PoolerConfig
from
vllm.logger
import
init_logger
from
vllm.pooling_params
import
PoolingParams
from
vllm.pooling_params
import
PoolingParams
from
vllm.sequence
import
PoolerOutput
,
PoolingSequenceGroupOutput
from
vllm.sequence
import
PoolerOutput
,
PoolingSequenceGroupOutput
from
vllm.tasks
import
PoolingTask
from
vllm.tasks
import
PoolingTask
from
vllm.utils
import
current_stream
,
resolve_obj_by_qualname
from
vllm.utils
import
current_stream
,
resolve_obj_by_qualname
from
vllm.v1.pool.metadata
import
PoolingCursor
,
PoolingMetadata
from
vllm.v1.pool.metadata
import
PoolingCursor
,
PoolingMetadata
logger
=
init_logger
(
__name__
)
PoolingFn
=
Callable
[
PoolingFn
=
Callable
[
[
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]],
PoolingMetadata
],
[
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]],
PoolingMetadata
],
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]]]
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]]]
...
@@ -183,7 +186,7 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
...
@@ -183,7 +186,7 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
fn
=
resolve_obj_by_qualname
(
function_name
)()
fn
=
resolve_obj_by_qualname
(
function_name
)()
return
PoolerActivation
.
wraps
(
fn
)
return
PoolerActivation
.
wraps
(
fn
)
return
Pooler
Score
()
return
Pooler
Classify
()
def
build_output
(
def
build_output
(
...
@@ -371,22 +374,29 @@ class PoolerMultiLabelClassify(PoolerActivation):
...
@@ -371,22 +374,29 @@ class PoolerMultiLabelClassify(PoolerActivation):
class
PoolerClassify
(
PoolerActivation
):
class
PoolerClassify
(
PoolerActivation
):
def
forward_chunk
(
self
,
pooled_data
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
__init__
(
self
,
*
,
static_num_labels
:
bool
=
True
)
->
None
:
num_labels
=
pooled_data
.
shape
[
-
1
]
super
().
__init__
()
if
num_labels
<
2
:
return
F
.
sigmoid
(
pooled_data
.
float
()).
to
(
pooled_data
.
dtype
)
return
F
.
softmax
(
pooled_data
.
float
(),
dim
=-
1
).
to
(
pooled_data
.
dtype
)
class
PoolerScore
(
PoolerActivation
):
if
static_num_labels
:
from
vllm.config
import
get_current_vllm_config
vllm_config
=
get_current_vllm_config
()
self
.
num_labels
=
getattr
(
vllm_config
.
model_config
.
hf_config
,
"num_labels"
,
0
)
if
self
.
num_labels
==
0
:
logger
.
warning
(
"num_labels should be > 0 for classification"
"models, falling back to softmax. "
"Please check if the configuration is correct."
)
else
:
self
.
num_labels
=
None
def
forward_chunk
(
self
,
pooled_data
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward_chunk
(
self
,
pooled_data
:
torch
.
Tensor
)
->
torch
.
Tensor
:
num_labels
=
pooled_data
.
shape
[
-
1
]
num_labels
=
(
self
.
num_labels
if
self
.
num_labels
is
not
None
else
pooled_data
.
shape
[
-
1
])
if
num_labels
<
2
:
if
num_labels
<
2
:
return
F
.
sigmoid
(
pooled_data
.
float
()).
to
(
pooled_data
.
dtype
)
return
F
.
sigmoid
(
pooled_data
.
float
()).
to
(
pooled_data
.
dtype
)
return
pooled_data
return
F
.
softmax
(
pooled_data
.
float
(),
dim
=-
1
).
to
(
pooled_data
.
dtype
)
class
LambdaPoolerActivation
(
PoolerActivation
):
class
LambdaPoolerActivation
(
PoolerActivation
):
...
@@ -428,6 +438,10 @@ class EmbeddingPoolerHead(PoolerHead):
...
@@ -428,6 +438,10 @@ class EmbeddingPoolerHead(PoolerHead):
def
forward
(
self
,
pooled_data
:
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
],
def
forward
(
self
,
pooled_data
:
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
],
pooling_metadata
:
PoolingMetadata
):
pooling_metadata
:
PoolingMetadata
):
if
isinstance
(
pooled_data
,
list
):
pooled_data
=
torch
.
stack
(
pooled_data
)
# pooled_data shape: [batchsize, hidden_dimension]
# Apply ST projector
# Apply ST projector
if
self
.
projector
is
not
None
:
if
self
.
projector
is
not
None
:
projector
=
cast
(
nn
.
Module
,
self
.
projector
)
projector
=
cast
(
nn
.
Module
,
self
.
projector
)
...
@@ -437,17 +451,11 @@ class EmbeddingPoolerHead(PoolerHead):
...
@@ -437,17 +451,11 @@ class EmbeddingPoolerHead(PoolerHead):
y
=
projector
(
x
.
to
(
torch
.
float32
))
y
=
projector
(
x
.
to
(
torch
.
float32
))
return
y
.
to
(
orig_dtype
)
return
y
.
to
(
orig_dtype
)
if
isinstance
(
pooled_data
,
torch
.
Tensor
):
pooled_data
=
_proj
(
pooled_data
)
pooled_data
=
_proj
(
pooled_data
)
# pooled_data shape: [batchsize, embedding_dimension]
else
:
pooled_data
=
[
_proj
(
t
)
for
t
in
pooled_data
]
pooling_params
=
get_pooling_params
(
pooling_metadata
)
pooling_params
=
get_pooling_params
(
pooling_metadata
)
if
isinstance
(
pooled_data
,
list
):
pooled_data
=
torch
.
stack
(
pooled_data
)
# pooled_data shape: [batchsize, embedding_dimension]
# for matryoshka representation
# for matryoshka representation
dimensions_list
=
[
dimensions_list
=
[
pooling_param
.
dimensions
for
pooling_param
in
pooling_params
pooling_param
.
dimensions
for
pooling_param
in
pooling_params
...
@@ -477,13 +485,14 @@ class EmbeddingPoolerHead(PoolerHead):
...
@@ -477,13 +485,14 @@ class EmbeddingPoolerHead(PoolerHead):
for
vecs
,
f
in
zip
(
pooled_data
,
flags
)
for
vecs
,
f
in
zip
(
pooled_data
,
flags
)
]
]
# pooled_data shape: [batchsize, embedding_dimension]
return
pooled_data
return
pooled_data
class
RewardPoolerHead
(
PoolerHead
):
class
RewardPoolerHead
(
PoolerHead
):
def
__init__
(
self
)
->
None
:
def
__init__
(
self
)
->
None
:
super
().
__init__
(
activation
=
PoolerClassify
())
super
().
__init__
(
activation
=
PoolerClassify
(
static_num_labels
=
False
))
def
forward
(
self
,
pooled_data
:
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
],
def
forward
(
self
,
pooled_data
:
Union
[
list
[
torch
.
Tensor
],
torch
.
Tensor
],
pooling_metadata
:
PoolingMetadata
):
pooling_metadata
:
PoolingMetadata
):
...
@@ -637,19 +646,13 @@ class ClassifierPooler(Pooler):
...
@@ -637,19 +646,13 @@ class ClassifierPooler(Pooler):
pooling_metadata
:
PoolingMetadata
,
pooling_metadata
:
PoolingMetadata
,
)
->
PoolerOutput
:
)
->
PoolerOutput
:
pooled_data
=
self
.
pooling
(
hidden_states
,
pooling_metadata
)
pooled_data
=
self
.
pooling
(
hidden_states
,
pooling_metadata
)
if
isinstance
(
pooled_data
,
list
):
if
isinstance
(
pooled_data
,
list
):
pooled_data
=
torch
.
stack
(
pooled_data
)
pooled_data
=
torch
.
stack
(
pooled_data
)
# pooled_data shape: [batchsize, hidden_size]
# pooled_data shape: [batchsize, hidden_size]
if
self
.
classifier
is
not
None
:
if
self
.
classifier
is
not
None
:
# apply classifier once on the full batch if possible
pooled_data
=
self
.
classifier
(
pooled_data
)
if
isinstance
(
pooled_data
,
torch
.
Tensor
):
# pooled_data shape: [batchsize, num_labels]
pooled_data
=
self
.
classifier
(
pooled_data
)
elif
len
({
data
.
shape
for
data
in
pooled_data
})
<=
1
:
pooled_data
=
self
.
classifier
(
torch
.
stack
(
pooled_data
))
else
:
pooled_data
=
[
self
.
classifier
(
data
)
for
data
in
pooled_data
]
pooling_params
=
get_pooling_params
(
pooling_metadata
)
pooling_params
=
get_pooling_params
(
pooling_metadata
)
flags
=
[
p
.
activation
for
p
in
pooling_params
]
flags
=
[
p
.
activation
for
p
in
pooling_params
]
...
@@ -662,6 +665,7 @@ class ClassifierPooler(Pooler):
...
@@ -662,6 +665,7 @@ class ClassifierPooler(Pooler):
for
vecs
,
f
in
zip
(
pooled_data
,
flags
)
for
vecs
,
f
in
zip
(
pooled_data
,
flags
)
]
]
# scores shape: [batchsize, num_labels]
return
build_output
(
scores
)
return
build_output
(
scores
)
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
d9e00dbd
...
@@ -1248,10 +1248,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -1248,10 +1248,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
and
"encode"
in
supported_tasks
):
and
"encode"
in
supported_tasks
):
supported_tasks
.
remove
(
"encode"
)
supported_tasks
.
remove
(
"encode"
)
logger
.
info_once
(
"Chunked prefill is not supported with "
logger
.
debug_once
(
"Chunked prefill is not supported with "
"encode task which using ALL pooling. "
"encode task which using ALL pooling. "
"Please turn off chunked prefill by "
"Please turn off chunked prefill by "
"`--no-enable-chunked-prefill` before using it."
)
"`--no-enable-chunked-prefill` before using it."
)
if
"score"
in
supported_tasks
:
num_labels
=
getattr
(
self
.
model_config
.
hf_config
,
"num_labels"
,
0
)
if
num_labels
!=
1
:
supported_tasks
.
remove
(
"score"
)
logger
.
debug_once
(
"Score API is only enabled for num_labels == 1."
)
return
supported_tasks
return
supported_tasks
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment