Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f54f8512
Unverified
Commit
f54f8512
authored
Oct 15, 2025
by
wang.yuqi
Committed by
GitHub
Oct 15, 2025
Browse files
[Model][2/N] Improve all pooling task | Support multi-vector retrieval (#25370)
Signed-off-by:
wang.yuqi
<
noooop@126.com
>
parent
d4d1a602
Changes
41
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
383 additions
and
76 deletions
+383
-76
examples/offline_inference/pooling/README.md
examples/offline_inference/pooling/README.md
+6
-0
examples/offline_inference/pooling/multi_vector_retrieval.py
examples/offline_inference/pooling/multi_vector_retrieval.py
+56
-0
examples/offline_inference/prithvi_geospatial_mae_io_processor.py
.../offline_inference/prithvi_geospatial_mae_io_processor.py
+1
-1
examples/online_serving/pooling/README.md
examples/online_serving/pooling/README.md
+6
-0
examples/online_serving/pooling/multi_vector_retrieval_client.py
...s/online_serving/pooling/multi_vector_retrieval_client.py
+54
-0
tests/conftest.py
tests/conftest.py
+6
-2
tests/entrypoints/pooling/llm/test_classify.py
tests/entrypoints/pooling/llm/test_classify.py
+1
-1
tests/entrypoints/pooling/llm/test_embedding.py
tests/entrypoints/pooling/llm/test_embedding.py
+7
-0
tests/entrypoints/pooling/llm/test_encode.py
tests/entrypoints/pooling/llm/test_encode.py
+8
-4
tests/entrypoints/pooling/llm/test_reward.py
tests/entrypoints/pooling/llm/test_reward.py
+12
-11
tests/entrypoints/pooling/openai/test_embedding.py
tests/entrypoints/pooling/openai/test_embedding.py
+18
-0
tests/entrypoints/pooling/openai/test_rerank.py
tests/entrypoints/pooling/openai/test_rerank.py
+18
-1
tests/models/language/pooling/test_multi_vector_retrieval.py
tests/models/language/pooling/test_multi_vector_retrieval.py
+45
-0
tests/models/language/pooling/test_pooler_config_init_behaviour.py
...els/language/pooling/test_pooler_config_init_behaviour.py
+50
-8
tests/models/language/pooling/test_token_classification.py
tests/models/language/pooling/test_token_classification.py
+2
-2
tests/models/multimodal/pooling/test_prithvi_mae.py
tests/models/multimodal/pooling/test_prithvi_mae.py
+1
-1
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
...dd_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+1
-1
tests/plugins_tests/test_io_processor_plugins.py
tests/plugins_tests/test_io_processor_plugins.py
+2
-3
tests/test_pooling_params.py
tests/test_pooling_params.py
+71
-22
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+18
-19
No files found.
examples/offline_inference/pooling/README.md
View file @
f54f8512
...
...
@@ -26,6 +26,12 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
python examples/offline_inference/pooling/embed_matryoshka_fy.py
```
## Multi vector retrieval usage
```
bash
python examples/offline_inference/pooling/multi_vector_retrieval.py
```
## Named Entity Recognition (NER) usage
```
bash
...
...
examples/offline_inference/pooling/multi_vector_retrieval.py
0 → 100644
View file @
f54f8512
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
from
vllm
import
LLM
,
EngineArgs
from
vllm.utils
import
FlexibleArgumentParser
def
parse_args
():
parser
=
FlexibleArgumentParser
()
parser
=
EngineArgs
.
add_cli_args
(
parser
)
# Set example specific arguments
parser
.
set_defaults
(
model
=
"BAAI/bge-m3"
,
runner
=
"pooling"
,
enforce_eager
=
True
,
)
return
parser
.
parse_args
()
def
main
(
args
:
Namespace
):
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create an LLM.
# You should pass runner="pooling" for embedding models
llm
=
LLM
(
**
vars
(
args
))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs
=
llm
.
embed
(
prompts
)
# Print the outputs.
print
(
"
\n
Generated Outputs:
\n
"
+
"-"
*
60
)
for
prompt
,
output
in
zip
(
prompts
,
outputs
):
embeds
=
output
.
outputs
.
embedding
print
(
len
(
embeds
))
# Generate embedding for each token. The output is a list of PoolingRequestOutput.
outputs
=
llm
.
encode
(
prompts
,
pooling_task
=
"token_embed"
)
# Print the outputs.
print
(
"
\n
Generated Outputs:
\n
"
+
"-"
*
60
)
for
prompt
,
output
in
zip
(
prompts
,
outputs
):
multi_vector
=
output
.
outputs
.
data
print
(
multi_vector
.
shape
)
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/offline_inference/prithvi_geospatial_mae_io_processor.py
View file @
f54f8512
...
...
@@ -40,7 +40,7 @@ def main():
model_impl
=
"terratorch"
,
)
pooling_params
=
PoolingParams
(
task
=
"
encode"
,
softmax
=
False
)
pooling_params
=
PoolingParams
(
task
=
"
token_classify"
,
activation
=
False
)
pooler_output
=
llm
.
encode
(
img_prompt
,
pooling_params
=
pooling_params
,
...
...
examples/online_serving/pooling/README.md
View file @
f54f8512
...
...
@@ -18,6 +18,12 @@ python examples/online_serving/pooling/embedding_embed_dtype_client.py
python examples/online_serving/pooling/jinaai_rerank_client.py
```
## Multi vector retrieval usage
```
bash
python examples/online_serving/pooling/multi_vector_retrieval_client.py
```
## Named Entity Recognition (NER) usage
```
bash
...
...
examples/online_serving/pooling/multi_vector_retrieval_client.py
0 → 100644
View file @
f54f8512
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example online usage of Pooling API for multi vector retrieval.
Run `vllm serve <model> --runner pooling`
to start up the server in vLLM. e.g.
vllm serve BAAI/bge-m3
"""
import
argparse
import
requests
import
torch
def
post_http_request
(
prompt
:
dict
,
api_url
:
str
)
->
requests
.
Response
:
headers
=
{
"User-Agent"
:
"Test Client"
}
response
=
requests
.
post
(
api_url
,
headers
=
headers
,
json
=
prompt
)
return
response
def
parse_args
():
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
"localhost"
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8000
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
"BAAI/bge-m3"
)
return
parser
.
parse_args
()
def
main
(
args
):
api_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
/pooling"
model_name
=
args
.
model
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
prompt
=
{
"model"
:
model_name
,
"input"
:
prompts
}
pooling_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api_url
)
for
output
in
pooling_response
.
json
()[
"data"
]:
multi_vector
=
torch
.
tensor
(
output
[
"data"
])
print
(
multi_vector
.
shape
)
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
tests/conftest.py
View file @
f54f8512
...
...
@@ -1011,8 +1011,12 @@ class VllmRunner:
req_outputs
=
self
.
llm
.
embed
(
inputs
,
*
args
,
**
kwargs
)
return
[
req_output
.
outputs
.
embedding
for
req_output
in
req_outputs
]
def
encode
(
self
,
prompts
:
list
[
str
])
->
list
[
list
[
float
]]:
req_outputs
=
self
.
llm
.
encode
(
prompts
)
def
token_embed
(
self
,
prompts
:
list
[
str
])
->
list
[
list
[
float
]]:
req_outputs
=
self
.
llm
.
encode
(
prompts
,
pooling_task
=
"token_embed"
)
return
[
req_output
.
outputs
.
data
for
req_output
in
req_outputs
]
def
token_classify
(
self
,
prompts
:
list
[
str
])
->
list
[
list
[
float
]]:
req_outputs
=
self
.
llm
.
encode
(
prompts
,
pooling_task
=
"token_classify"
)
return
[
req_output
.
outputs
.
data
for
req_output
in
req_outputs
]
def
reward
(
self
,
prompts
:
list
[
str
])
->
list
[
list
[
float
]]:
...
...
tests/entrypoints/pooling/llm/test_classify.py
View file @
f54f8512
...
...
@@ -63,7 +63,7 @@ def test_encode_api(llm: LLM):
# chunked prefill does not support all pooling
err_msg
=
"pooling_task must be one of.+"
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
llm
.
encode
(
prompts
,
use_tqdm
=
False
)
llm
.
encode
(
prompts
,
pooling_task
=
"token_classify"
,
use_tqdm
=
False
)
def
test_score_api
(
llm
:
LLM
):
...
...
tests/entrypoints/pooling/llm/test_embedding.py
View file @
f54f8512
...
...
@@ -35,6 +35,13 @@ def llm():
cleanup_dist_env_and_memory
()
@
pytest
.
mark
.
skip_global_cleanup
def
test_encode_api
(
llm
:
LLM
):
outputs
=
llm
.
encode
(
prompts
,
pooling_task
=
"token_embed"
,
use_tqdm
=
False
)
multi_vector
=
outputs
[
0
].
outputs
.
data
assert
multi_vector
.
shape
==
(
11
,
384
)
def
test_pooling_params
(
llm
:
LLM
):
def
get_outputs
(
normalize
):
outputs
=
llm
.
embed
(
...
...
tests/entrypoints/pooling/llm/test_encode.py
View file @
f54f8512
...
...
@@ -57,20 +57,24 @@ def test_multiple_pooling_params(llm: LLM):
]
# Multiple PoolingParams should be matched with each prompt
outputs
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
pooling_params
)
outputs
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
pooling_params
,
pooling_task
=
"embed"
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
# Exception raised, if the size of params does not match the size of prompts
with
pytest
.
raises
(
ValueError
):
outputs
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
pooling_params
[:
3
])
outputs
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
pooling_params
[:
3
],
pooling_task
=
"embed"
)
# Single PoolingParams should be applied to every prompt
single_pooling_params
=
PoolingParams
()
outputs
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
single_pooling_params
)
outputs
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
single_pooling_params
,
pooling_task
=
"embed"
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
# pooling_params is None, default params should be applied
outputs
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
None
)
outputs
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
None
,
pooling_task
=
"embed"
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
...
...
tests/entrypoints/pooling/llm/test_reward.py
View file @
f54f8512
...
...
@@ -36,22 +36,23 @@ def llm():
cleanup_dist_env_and_memory
()
@
pytest
.
mark
.
skip_global_cleanup
def
test_pooling_params
(
llm
:
LLM
):
def
get_outputs
(
softmax
):
def
get_outputs
(
activation
):
outputs
=
llm
.
reward
(
prompts
,
pooling_params
=
PoolingParams
(
softmax
=
softmax
),
use_tqdm
=
False
prompts
,
pooling_params
=
PoolingParams
(
activation
=
activation
),
use_tqdm
=
False
)
return
torch
.
cat
([
x
.
outputs
.
data
for
x
in
outputs
])
default
=
get_outputs
(
softmax
=
None
)
w_
softmax
=
get_outputs
(
softmax
=
True
)
wo_
softmax
=
get_outputs
(
softmax
=
False
)
default
=
get_outputs
(
activation
=
None
)
w_
activation
=
get_outputs
(
activation
=
True
)
wo_
activation
=
get_outputs
(
activation
=
False
)
assert
torch
.
allclose
(
default
,
w_softmax
,
atol
=
1e-2
),
"Default should use softmax."
assert
not
torch
.
allclose
(
w_softmax
,
wo_softmax
,
atol
=
1e-2
),
(
"wo_softmax should not use softmax."
assert
torch
.
allclose
(
default
,
w_activation
,
atol
=
1e-2
),
(
"Default should use activation."
)
assert
torch
.
allclose
(
softmax
(
wo_softmax
),
w_softmax
,
atol
=
1e-2
),
(
"w_softmax should be close to softmax(wo_softmax)."
assert
not
torch
.
allclose
(
w_activation
,
wo_activation
,
atol
=
1e-2
),
(
"wo_activation should not use activation."
)
assert
torch
.
allclose
(
softmax
(
wo_activation
),
w_activation
,
atol
=
1e-2
),
(
"w_activation should be close to activation(wo_activation)."
)
tests/entrypoints/pooling/openai/test_embedding.py
View file @
f54f8512
...
...
@@ -17,6 +17,7 @@ from tests.utils import RemoteOpenAIServer
from
vllm.entrypoints.openai.protocol
import
(
EMBED_DTYPE_TO_TORCH_DTYPE
,
EmbeddingResponse
,
PoolingResponse
,
)
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
...
...
@@ -509,3 +510,20 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str):
assert
torch
.
allclose
(
w_normal
,
F
.
normalize
(
wo_normal
,
p
=
2
,
dim
=-
1
),
atol
=
1e-2
),
(
"w_normal should be close to normal(wo_normal)."
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_pooling
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
input_text
=
[
"The chef prepared a delicious meal."
]
response
=
requests
.
post
(
server
.
url_for
(
"pooling"
),
json
=
{
"model"
:
model_name
,
"input"
:
input_text
,
"encoding_format"
:
"float"
},
)
poolings
=
PoolingResponse
.
model_validate
(
response
.
json
())
assert
len
(
poolings
.
data
)
==
1
assert
len
(
poolings
.
data
[
0
].
data
)
==
11
assert
len
(
poolings
.
data
[
0
].
data
[
0
])
==
384
tests/entrypoints/pooling/openai/test_rerank.py
View file @
f54f8512
...
...
@@ -7,7 +7,7 @@ import torch
import
torch.nn.functional
as
F
from
tests.utils
import
RemoteOpenAIServer
from
vllm.entrypoints.openai.protocol
import
RerankResponse
from
vllm.entrypoints.openai.protocol
import
PoolingResponse
,
RerankResponse
MODEL_NAME
=
"BAAI/bge-reranker-base"
DTYPE
=
"bfloat16"
...
...
@@ -159,3 +159,20 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str):
assert
torch
.
allclose
(
F
.
sigmoid
(
wo_activation
),
w_activation
,
atol
=
1e-2
),
(
"w_activation should be close to activation(wo_activation)."
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_pooling
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
input_text
=
[
"The chef prepared a delicious meal."
]
response
=
requests
.
post
(
server
.
url_for
(
"pooling"
),
json
=
{
"model"
:
model_name
,
"input"
:
input_text
,
"encoding_format"
:
"float"
},
)
poolings
=
PoolingResponse
.
model_validate
(
response
.
json
())
assert
len
(
poolings
.
data
)
==
1
assert
len
(
poolings
.
data
[
0
].
data
)
==
11
assert
len
(
poolings
.
data
[
0
].
data
[
0
])
==
1
tests/models/language/pooling/test_multi_vector_retrieval.py
0 → 100644
View file @
f54f8512
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
transformers
import
AutoModel
from
tests.models.utils
import
check_embeddings_close
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"BAAI/bge-m3"
],
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
torch
.
inference_mode
def
test_embed_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
):
with
vllm_runner
(
model
,
runner
=
"pooling"
,
max_model_len
=
None
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
token_embed
(
example_prompts
)
with
hf_runner
(
model
,
auto_cls
=
AutoModel
,
)
as
hf_model
:
tokenizer
=
hf_model
.
tokenizer
hf_outputs
=
[]
for
prompt
in
example_prompts
:
inputs
=
tokenizer
([
prompt
],
return_tensors
=
"pt"
)
inputs
=
hf_model
.
wrap_device
(
inputs
)
output
=
hf_model
.
model
(
**
inputs
)
embedding
=
output
.
last_hidden_state
[
0
].
float
()
# normal
hf_outputs
.
append
(
embedding
.
cpu
())
for
hf_output
,
vllm_output
in
zip
(
hf_outputs
,
vllm_outputs
):
check_embeddings_close
(
embeddings_0_lst
=
hf_output
,
embeddings_1_lst
=
vllm_output
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
tol
=
1e-2
,
)
tests/models/language/pooling/test_pooler_config_init_behaviour.py
View file @
f54f8512
...
...
@@ -93,7 +93,7 @@ def test_embed_models_using_normalize(
],
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_reward_models_using_
softmax
(
def
test_reward_models_using_
activation
(
hf_runner
,
vllm_runner
,
example_prompts
,
...
...
@@ -104,22 +104,64 @@ def test_reward_models_using_softmax(
model
,
max_model_len
=
1024
,
dtype
=
dtype
,
pooler_config
=
PoolerConfig
(
softmax
=
False
),
pooler_config
=
PoolerConfig
(
activation
=
False
),
)
as
vllm_model
:
wo_
softmax
=
vllm_model
.
encode
(
example_prompts
)
wo_
activation
=
vllm_model
.
reward
(
example_prompts
)
with
vllm_runner
(
model
,
max_model_len
=
1024
,
dtype
=
dtype
,
pooler_config
=
PoolerConfig
(
softmax
=
True
)
model
,
max_model_len
=
1024
,
dtype
=
dtype
,
pooler_config
=
PoolerConfig
(
activation
=
True
),
)
as
vllm_model
:
w_
softmax
=
vllm_model
.
encode
(
example_prompts
)
w_
activation
=
vllm_model
.
reward
(
example_prompts
)
for
wo
,
w
in
zip
(
wo_
softmax
,
w_softmax
):
for
wo
,
w
in
zip
(
wo_
activation
,
w_activation
):
wo
=
torch
.
tensor
(
wo
)
w
=
torch
.
tensor
(
w
)
assert
not
torch
.
allclose
(
wo
,
w
,
atol
=
1e-2
),
(
"pooler_config
softmax
is not working"
"pooler_config
activation
is not working"
)
assert
torch
.
allclose
(
softmax
(
wo
),
w
,
atol
=
1e-2
),
(
"w_softmax should be close to softmax(wo_softmax)."
"w_activation should be close to activation(wo_activation)."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"intfloat/multilingual-e5-small"
,
],
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_multi_vector_retrieval_models_using_normalize
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
)
->
None
:
with
vllm_runner
(
model
,
max_model_len
=
512
,
dtype
=
dtype
,
pooler_config
=
PoolerConfig
(
normalize
=
False
),
)
as
vllm_model
:
wo_normalize
=
vllm_model
.
token_embed
(
example_prompts
)
with
vllm_runner
(
model
,
max_model_len
=
512
,
dtype
=
dtype
,
pooler_config
=
PoolerConfig
(
normalize
=
True
),
)
as
vllm_model
:
w_normalize
=
vllm_model
.
token_embed
(
example_prompts
)
for
wo
,
w
in
zip
(
wo_normalize
,
w_normalize
):
assert
not
torch
.
allclose
(
wo
,
w
,
atol
=
1e-2
),
(
"pooler_config normalize is not working"
)
assert
torch
.
allclose
(
F
.
normalize
(
wo
,
p
=
2
,
dim
=-
1
),
w
,
atol
=
1e-2
),
(
"w_normal should be close to normal(wo_normal)."
)
tests/models/language/pooling/test_token_classification.py
View file @
f54f8512
...
...
@@ -19,7 +19,7 @@ def test_bert_models(
dtype
:
str
,
)
->
None
:
with
vllm_runner
(
model
,
max_model_len
=
None
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
vllm_outputs
=
vllm_model
.
token_classify
(
example_prompts
)
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelForTokenClassification
...
...
@@ -50,7 +50,7 @@ def test_modernbert_models(
dtype
:
str
,
)
->
None
:
with
vllm_runner
(
model
,
max_model_len
=
None
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
vllm_outputs
=
vllm_model
.
token_classify
(
example_prompts
)
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelForTokenClassification
...
...
tests/models/multimodal/pooling/test_prithvi_mae.py
View file @
f54f8512
...
...
@@ -39,7 +39,7 @@ def _run_test(
max_num_seqs
=
32
,
default_torch_num_threads
=
1
,
)
as
vllm_model
:
vllm_model
.
encode
(
prompt
)
vllm_model
.
llm
.
encode
(
prompt
,
pooling_task
=
"token_classify"
)
MODELS
=
[
"mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
]
...
...
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
View file @
f54f8512
...
...
@@ -30,7 +30,7 @@ class MyGemma2Embedding(nn.Module):
self
.
pooler
=
DispatchPooler
(
{
"
encode
"
:
Pooler
.
for_
encode
(
pooler_config
),
"
token_embed
"
:
Pooler
.
for_
token_embed
(
pooler_config
),
"embed"
:
Pooler
.
for_embed
(
pooler_config
),
}
)
...
...
tests/plugins_tests/test_io_processor_plugins.py
View file @
f54f8512
...
...
@@ -93,7 +93,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
out_data_format
=
"b64_json"
,
)
pooling_params
=
PoolingParams
(
task
=
"encode"
,
softmax
=
False
)
pooling_params
=
PoolingParams
(
activation
=
False
)
with
vllm_runner
(
model_name
,
...
...
@@ -108,8 +108,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
io_processor_plugin
=
"prithvi_to_tiff"
,
)
as
llm_runner
:
pooler_output
=
llm_runner
.
get_llm
().
encode
(
img_prompt
,
pooling_params
=
pooling_params
,
img_prompt
,
pooling_params
=
pooling_params
,
pooling_task
=
"token_classify"
)
output
=
pooler_output
[
0
].
outputs
...
...
tests/test_pooling_params.py
View file @
f54f8512
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
import
pytest
from
tests.models.utils
import
EmbedModelInfo
from
vllm
import
PoolingParams
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
,
PoolerConfig
EMBEDDING_MODELS
=
[
EmbedModelInfo
(
"intfloat/multilingual-e5-small"
,
is_matryoshka
=
False
),
...
...
@@ -15,6 +17,15 @@ EMBEDDING_MODELS = [
),
]
classify_parameters
=
[
"activation"
]
embed_parameters
=
[
"dimensions"
,
"normalize"
]
step_pooling_parameters
=
[
"step_tag_id"
,
"returned_token_ids"
]
@
dataclass
()
class
MockModelConfig
:
pooler_config
:
PoolerConfig
def
test_task
():
pooling_params
=
PoolingParams
()
...
...
@@ -24,25 +35,27 @@ def test_task():
pooling_params
.
verify
(
task
=
"score"
)
with
pytest
.
raises
(
ValueError
):
pooling_params
.
verify
(
task
=
"
encode
"
)
pooling_params
.
verify
(
task
=
"
classify
"
)
def
test_embed
():
task
=
"embed"
model_config
=
MockModelConfig
(
pooler_config
=
PoolerConfig
(
pooling_type
=
"CLS"
))
pooling_params
=
PoolingParams
(
normalize
=
None
)
pooling_params
.
verify
(
task
=
task
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
pooling_params
=
PoolingParams
(
normalize
=
True
)
pooling_params
.
verify
(
task
=
task
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
pooling_params
=
PoolingParams
(
normalize
=
False
)
pooling_params
.
verify
(
task
=
task
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
invalid_parameters
=
[
"activation"
,
"softmax"
]
invalid_parameters
=
classify_parameters
+
step_pooling_parameters
for
p
in
invalid_parameters
:
with
pytest
.
raises
(
ValueError
):
pooling_params
=
PoolingParams
(
**
{
p
:
True
})
pooling_params
.
verify
(
task
=
task
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
@
pytest
.
mark
.
parametrize
(
"model_info"
,
EMBEDDING_MODELS
)
...
...
@@ -73,35 +86,71 @@ def test_embed_dimensions(model_info: EmbedModelInfo):
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"score"
,
"classify"
])
def
test_classify
(
task
):
model_config
=
MockModelConfig
(
pooler_config
=
PoolerConfig
(
pooling_type
=
"CLS"
))
pooling_params
=
PoolingParams
(
activation
=
None
)
pooling_params
.
verify
(
task
=
task
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
pooling_params
=
PoolingParams
(
activation
=
True
)
pooling_params
.
verify
(
task
=
task
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
pooling_params
=
PoolingParams
(
activation
=
False
)
pooling_params
.
verify
(
task
=
task
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
invalid_parameters
=
embed_parameters
+
step_pooling_parameters
for
p
in
invalid_parameters
:
with
pytest
.
raises
(
ValueError
):
pooling_params
=
PoolingParams
(
**
{
p
:
True
})
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
@
pytest
.
mark
.
parametrize
(
"pooling_type"
,
[
"ALL"
,
"STEP"
])
def
test_token_embed
(
pooling_type
:
str
):
task
=
"token_embed"
model_config
=
MockModelConfig
(
pooler_config
=
PoolerConfig
(
pooling_type
=
pooling_type
)
)
pooling_params
=
PoolingParams
(
normalize
=
None
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
pooling_params
=
PoolingParams
(
normalize
=
True
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
pooling_params
=
PoolingParams
(
normalize
=
False
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
invalid_parameters
=
classify_parameters
if
pooling_type
!=
"STEP"
:
invalid_parameters
=
classify_parameters
+
step_pooling_parameters
invalid_parameters
=
[
"dimensions"
,
"normalize"
,
"softmax"
]
for
p
in
invalid_parameters
:
with
pytest
.
raises
(
ValueError
):
pooling_params
=
PoolingParams
(
**
{
p
:
True
})
pooling_params
.
verify
(
task
=
task
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
def
test_encode
():
task
=
"encode"
pooling_params
=
PoolingParams
(
softmax
=
None
)
pooling_params
.
verify
(
task
=
task
)
@
pytest
.
mark
.
parametrize
(
"pooling_type"
,
[
"ALL"
,
"STEP"
])
def
test_token_classify
(
pooling_type
:
str
):
task
=
"token_classify"
model_config
=
MockModelConfig
(
pooler_config
=
PoolerConfig
(
pooling_type
=
pooling_type
)
)
pooling_params
=
PoolingParams
(
activation
=
None
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
pooling_params
=
PoolingParams
(
activation
=
True
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
pooling_params
=
PoolingParams
(
softmax
=
Tru
e
)
pooling_params
.
verify
(
task
=
task
)
pooling_params
=
PoolingParams
(
activation
=
Fals
e
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
pooling_params
=
PoolingParams
(
softmax
=
False
)
pooling_params
.
verify
(
task
=
task
)
invalid_parameters
=
embed_parameters
if
pooling_type
!=
"STEP"
:
invalid_parameters
=
embed_parameters
+
step_pooling_parameters
invalid_parameters
=
[
"dimensions"
,
"normalize"
,
"activation"
]
for
p
in
invalid_parameters
:
with
pytest
.
raises
(
ValueError
):
pooling_params
=
PoolingParams
(
**
{
p
:
True
})
pooling_params
.
verify
(
task
=
task
)
pooling_params
.
verify
(
task
=
task
,
model_config
=
model_config
)
vllm/entrypoints/llm.py
View file @
f54f8512
...
...
@@ -951,7 +951,7 @@ class LLM:
truncate_prompt_tokens
:
int
|
None
=
None
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
]
=
True
,
lora_request
:
list
[
LoRARequest
]
|
LoRARequest
|
None
=
None
,
pooling_task
:
PoolingTask
=
"encode"
,
pooling_task
:
PoolingTask
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
list
[
PoolingRequestOutput
]:
"""Apply pooling to the hidden states corresponding to the input
...
...
@@ -986,26 +986,25 @@ class LLM:
instead pass them via the `inputs` parameter.
"""
if
self
.
supported_tasks
==
[
"encode"
]
and
pooling_task
is
None
:
pooling_task
=
"encode"
if
pooling_task
is
None
:
pooling_task
=
"embed"
if
"embed"
in
self
.
supported_tasks
else
"encode"
logger
.
warning_once
(
"`LLM.encode` is currently using `pooling_task = %s`.
\n
"
error_str
=
(
"pooling_task required for `LLM.encode`
\n
"
"Please use one of the more specific methods or set the "
"task directly
when using `LLM.encode`:
\n
"
"pooling_task
when using `LLM.encode`:
\n
"
" - For embeddings, use `LLM.embed(...)` "
'or `pooling_task="embed"`.
\n
'
" - For classification logits, use `LLM.classify(...)` "
'or `pooling_task="classify"`.
\n
'
" - For similarity scores, use `LLM.score(...)`.
\n
"
" - For rewards, use `LLM.reward(...)` "
'or `pooling_task="reward"`
\n
'
" - For similarity scores, use `LLM.score(...)`."
,
pooling_task
,
'or `pooling_task="token_classify"`
\n
'
" - For token classification, "
'use `pooling_task="token_classify"`
\n
'
' - For multi-vector retrieval, use `pooling_task="token_embed"`'
)
if
pooling_task
is
None
:
raise
ValueError
(
error_str
)
model_config
=
self
.
model_config
runner_type
=
model_config
.
runner_type
if
runner_type
!=
"pooling"
:
...
...
@@ -1206,7 +1205,7 @@ class LLM:
lora_request
=
lora_request
,
pooling_params
=
pooling_params
,
truncate_prompt_tokens
=
truncate_prompt_tokens
,
pooling_task
=
"
encode
"
,
pooling_task
=
"
token_classify
"
,
)
def
_embedding_score
(
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment