Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5eb36575
Commit
5eb36575
authored
Apr 26, 2026
by
khluu
Browse files
Revert "[Frontend] Remove frontend pooling multi task support. (#37861)"
This reverts commit
d2e2e856
.
parent
4d51588e
Changes
26
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
228 additions
and
200 deletions
+228
-200
docs/models/pooling_models/README.md
docs/models/pooling_models/README.md
+3
-3
examples/pooling/token_embed/jina_embeddings_v4_offline.py
examples/pooling/token_embed/jina_embeddings_v4_offline.py
+51
-57
examples/pooling/token_embed/multi_vector_retrieval_offline.py
...les/pooling/token_embed/multi_vector_retrieval_offline.py
+9
-16
examples/pooling/token_embed/multi_vector_retrieval_online.py
...ples/pooling/token_embed/multi_vector_retrieval_online.py
+3
-17
tests/entrypoints/pooling/classify/test_offline.py
tests/entrypoints/pooling/classify/test_offline.py
+15
-4
tests/entrypoints/pooling/classify/test_online.py
tests/entrypoints/pooling/classify/test_online.py
+20
-4
tests/entrypoints/pooling/embed/test_offline.py
tests/entrypoints/pooling/embed/test_offline.py
+13
-6
tests/entrypoints/pooling/embed/test_online.py
tests/entrypoints/pooling/embed/test_online.py
+22
-5
tests/entrypoints/pooling/scoring/test_cross_encoder_online.py
.../entrypoints/pooling/scoring/test_cross_encoder_online.py
+20
-3
tests/entrypoints/pooling/token_classify/test_offline.py
tests/entrypoints/pooling/token_classify/test_offline.py
+12
-8
tests/entrypoints/pooling/token_classify/test_online.py
tests/entrypoints/pooling/token_classify/test_online.py
+1
-4
tests/entrypoints/pooling/token_embed/test_offline.py
tests/entrypoints/pooling/token_embed/test_offline.py
+11
-7
tests/entrypoints/pooling/token_embed/test_online.py
tests/entrypoints/pooling/token_embed/test_online.py
+1
-4
tests/models/language/pooling/test_all_pooling_plus_chunked_prefill.py
...language/pooling/test_all_pooling_plus_chunked_prefill.py
+0
-2
tests/models/language/pooling/test_bge_m3.py
tests/models/language/pooling/test_bge_m3.py
+14
-46
tests/models/language/pooling/test_extract_hidden_states.py
tests/models/language/pooling/test_extract_hidden_states.py
+11
-2
tests/models/language/pooling/test_multi_vector_retrieval.py
tests/models/language/pooling/test_multi_vector_retrieval.py
+0
-2
tests/models/language/pooling/test_pooler_config_init_behaviour.py
...els/language/pooling/test_pooler_config_init_behaviour.py
+2
-2
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+8
-5
vllm/entrypoints/pooling/base/serving.py
vllm/entrypoints/pooling/base/serving.py
+12
-3
No files found.
docs/models/pooling_models/README.md
View file @
5eb36575
...
@@ -292,10 +292,10 @@ Pooling models now support token-wise task.
...
@@ -292,10 +292,10 @@ Pooling models now support token-wise task.
### Score task
### Score task
`score`
task
have has
be
en
removed in v0.2
1,
use
`classify`
instead. Only when a
classification model outputs num_labels
`score`
task
is deprecated and will
be removed in v0.2
0. Please
use
`classify`
instead. Only when a
equal to 1 can it be used as a scoring model and have its scoring API enabled.
classification model outputs num_labels
equal to 1 can it be used as a scoring model and have its scoring API enabled.
### Pooling multitask support
### Pooling multitask support
Pooling multitask support
has been
removed in v0.2
1
. When the default pooling task is not what you want,
Pooling multitask support
is deprecated and will be
removed in v0.2
0
. When the default pooling task is not what you want,
you need to manually specify it via
`PoolerConfig(task=<task>)`
offline or
`--pooler-config.task <task>`
online.
you need to manually specify it via
`PoolerConfig(task=<task>)`
offline or
`--pooler-config.task <task>`
online.
examples/pooling/token_embed/jina_embeddings_v4_offline.py
View file @
5eb36575
...
@@ -4,74 +4,68 @@
...
@@ -4,74 +4,68 @@
import
torch
import
torch
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.config
import
PoolerConfig
from
vllm.inputs
import
TextPrompt
from
vllm.inputs
import
TextPrompt
from
vllm.multimodal.utils
import
fetch_image
from
vllm.multimodal.utils
import
fetch_image
# Initialize model
model
=
LLM
(
model
=
"jinaai/jina-embeddings-v4-vllm-text-matching"
,
runner
=
"pooling"
,
max_model_len
=
1024
,
gpu_memory_utilization
=
0.8
,
)
def
main
():
# Create text prompts
# Initialize model
text1
=
"Ein wunderschöner Sonnenuntergang am Strand"
model
=
LLM
(
text1_prompt
=
TextPrompt
(
prompt
=
f
"Query:
{
text1
}
"
)
model
=
"jinaai/jina-embeddings-v4-vllm-text-matching"
,
pooler_config
=
PoolerConfig
(
task
=
"token_embed"
),
runner
=
"pooling"
,
max_model_len
=
1024
,
gpu_memory_utilization
=
0.8
,
)
# Create text prompts
text2
=
"浜辺に沈む美しい夕日"
text1
=
"Ein wunderschöner Sonnenuntergang am Strand"
text2_prompt
=
TextPrompt
(
prompt
=
f
"Query:
{
text2
}
"
)
text1_prompt
=
TextPrompt
(
prompt
=
f
"Query:
{
text1
}
"
)
text2
=
"浜辺に沈む美しい夕日"
# Create image prompt
text2_prompt
=
TextPrompt
(
prompt
=
f
"Query:
{
text2
}
"
)
image
=
fetch_image
(
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg"
# noqa: E501
)
image_prompt
=
TextPrompt
(
prompt
=
"<|im_start|>user
\n
<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>
\n
"
,
# noqa: E501
multi_modal_data
=
{
"image"
:
image
},
)
# Create image prompt
# Encode all prompts
image
=
fetch_image
(
prompts
=
[
text1_prompt
,
text2_prompt
,
image_prompt
]
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg"
# noqa: E501
outputs
=
model
.
encode
(
prompts
,
pooling_task
=
"token_embed"
)
)
image_prompt
=
TextPrompt
(
prompt
=
"<|im_start|>user
\n
<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>
\n
"
,
# noqa: E501
multi_modal_data
=
{
"image"
:
image
},
)
# Encode all prompts
prompts
=
[
text1_prompt
,
text2_prompt
,
image_prompt
]
outputs
=
model
.
encode
(
prompts
,
pooling_task
=
"token_embed"
)
def
get_embeddings
(
outputs
):
def
get_embeddings
(
outputs
):
VISION_START_TOKEN_ID
,
VISION_END_TOKEN_ID
=
151652
,
151653
VISION_START_TOKEN_ID
,
VISION_END_TOKEN_ID
=
151652
,
151653
embeddings
=
[]
embeddings
=
[]
for
output
in
outputs
:
for
output
in
outputs
:
if
VISION_START_TOKEN_ID
in
output
.
prompt_token_ids
:
if
VISION_START_TOKEN_ID
in
output
.
prompt_token_ids
:
# Gather only vision tokens
# Gather only vision tokens
img_start_pos
=
torch
.
where
(
img_start_pos
=
torch
.
where
(
torch
.
tensor
(
output
.
prompt_token_ids
)
==
VISION_START_TOKEN_ID
torch
.
tensor
(
output
.
prompt_token_ids
)
==
VISION_START_TOKEN_ID
)[
0
][
0
]
)[
0
][
0
]
img_end_pos
=
torch
.
where
(
img_end_pos
=
torch
.
where
(
torch
.
tensor
(
output
.
prompt_token_ids
)
==
VISION_END_TOKEN_ID
torch
.
tensor
(
output
.
prompt_token_ids
)
==
VISION_END_TOKEN_ID
)[
0
][
0
]
)[
0
][
0
]
embeddings_tensor
=
output
.
outputs
.
data
.
detach
().
clone
()[
embeddings_tensor
=
output
.
outputs
.
data
.
detach
().
clone
()[
img_start_pos
:
img_end_pos
+
1
img_start_pos
:
img_end_pos
+
1
]
]
else
:
else
:
# Use all tokens for text-only prompts
# Use all tokens for text-only prompts
embeddings_tensor
=
output
.
outputs
.
data
.
detach
().
clone
()
embeddings_tensor
=
output
.
outputs
.
data
.
detach
().
clone
()
# Pool and normalize embeddings
# Pool and normalize embeddings
pooled_output
=
(
pooled_output
=
(
embeddings_tensor
.
sum
(
dim
=
0
,
dtype
=
torch
.
float32
)
embeddings_tensor
.
sum
(
dim
=
0
,
dtype
=
torch
.
float32
)
/
embeddings_tensor
.
shape
[
0
]
/
embeddings_tensor
.
shape
[
0
]
)
)
embeddings
.
append
(
torch
.
nn
.
functional
.
normalize
(
pooled_output
,
dim
=-
1
))
embeddings
.
append
(
torch
.
nn
.
functional
.
normalize
(
pooled_output
,
dim
=-
1
))
return
embeddings
return
embeddings
embeddings
=
get_embeddings
(
outputs
)
for
embedding
in
embeddings
:
embeddings
=
get_embeddings
(
outputs
)
print
(
embedding
.
shape
)
for
embedding
in
embeddings
:
if
__name__
==
"__main__"
:
print
(
embedding
.
shape
)
main
()
examples/pooling/token_embed/multi_vector_retrieval_offline.py
View file @
5eb36575
...
@@ -4,7 +4,6 @@
...
@@ -4,7 +4,6 @@
from
argparse
import
Namespace
from
argparse
import
Namespace
from
vllm
import
LLM
,
EngineArgs
from
vllm
import
LLM
,
EngineArgs
from
vllm.config
import
PoolerConfig
from
vllm.utils.argparse_utils
import
FlexibleArgumentParser
from
vllm.utils.argparse_utils
import
FlexibleArgumentParser
...
@@ -14,7 +13,6 @@ def parse_args():
...
@@ -14,7 +13,6 @@ def parse_args():
# Set example specific arguments
# Set example specific arguments
parser
.
set_defaults
(
parser
.
set_defaults
(
model
=
"BAAI/bge-m3"
,
model
=
"BAAI/bge-m3"
,
pooler_config
=
PoolerConfig
(
task
=
"token_embed"
),
runner
=
"pooling"
,
runner
=
"pooling"
,
enforce_eager
=
True
,
enforce_eager
=
True
,
)
)
...
@@ -34,6 +32,15 @@ def main(args: Namespace):
...
@@ -34,6 +32,15 @@ def main(args: Namespace):
# You should pass runner="pooling" for embedding models
# You should pass runner="pooling" for embedding models
llm
=
LLM
(
**
vars
(
args
))
llm
=
LLM
(
**
vars
(
args
))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs
=
llm
.
embed
(
prompts
)
# Print the outputs.
print
(
"
\n
Generated Outputs:
\n
"
+
"-"
*
60
)
for
prompt
,
output
in
zip
(
prompts
,
outputs
):
embeds
=
output
.
outputs
.
embedding
print
(
len
(
embeds
))
# Generate embedding for each token. The output is a list of PoolingRequestOutput.
# Generate embedding for each token. The output is a list of PoolingRequestOutput.
outputs
=
llm
.
encode
(
prompts
,
pooling_task
=
"token_embed"
)
outputs
=
llm
.
encode
(
prompts
,
pooling_task
=
"token_embed"
)
...
@@ -43,20 +50,6 @@ def main(args: Namespace):
...
@@ -43,20 +50,6 @@ def main(args: Namespace):
multi_vector
=
output
.
outputs
.
data
multi_vector
=
output
.
outputs
.
data
print
(
multi_vector
.
shape
)
print
(
multi_vector
.
shape
)
query
=
"What is the capital of France?"
documents
=
[
"The capital of Brazil is Brasilia."
,
"The capital of France is Paris."
,
]
# Generate scores.
outputs
=
llm
.
score
(
query
,
documents
)
# Print the outputs.
print
(
"
\n
Generated Outputs:
\n
"
+
"-"
*
60
)
for
document
,
output
in
zip
(
documents
,
outputs
):
score
=
output
.
outputs
.
score
print
(
f
"Pair:
{
[
query
,
document
]
!
r
}
\n
Score:
{
score
}
"
)
print
(
"-"
*
60
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
args
=
parse_args
()
args
=
parse_args
()
...
...
examples/pooling/token_embed/multi_vector_retrieval_online.py
View file @
5eb36575
...
@@ -7,11 +7,10 @@ Example online usage of Pooling API for multi vector retrieval.
...
@@ -7,11 +7,10 @@ Example online usage of Pooling API for multi vector retrieval.
Run `vllm serve <model> --runner pooling`
Run `vllm serve <model> --runner pooling`
to start up the server in vLLM. e.g.
to start up the server in vLLM. e.g.
vllm serve BAAI/bge-m3
--pooler-config.task token_embed
vllm serve BAAI/bge-m3
"""
"""
import
argparse
import
argparse
import
pprint
import
requests
import
requests
import
torch
import
torch
...
@@ -33,8 +32,7 @@ def parse_args():
...
@@ -33,8 +32,7 @@ def parse_args():
def
main
(
args
):
def
main
(
args
):
pooling_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
/pooling"
api_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
/pooling"
score_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
/score"
model_name
=
args
.
model
model_name
=
args
.
model
prompts
=
[
prompts
=
[
...
@@ -45,23 +43,11 @@ def main(args):
...
@@ -45,23 +43,11 @@ def main(args):
]
]
prompt
=
{
"model"
:
model_name
,
"input"
:
prompts
}
prompt
=
{
"model"
:
model_name
,
"input"
:
prompts
}
pooling_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
pooling
_url
)
pooling_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api
_url
)
for
output
in
pooling_response
.
json
()[
"data"
]:
for
output
in
pooling_response
.
json
()[
"data"
]:
multi_vector
=
torch
.
tensor
(
output
[
"data"
])
multi_vector
=
torch
.
tensor
(
output
[
"data"
])
print
(
multi_vector
.
shape
)
print
(
multi_vector
.
shape
)
queries
=
"What is the capital of France?"
documents
=
[
"The capital of Brazil is Brasilia."
,
"The capital of France is Paris."
,
]
prompt
=
{
"model"
:
model_name
,
"queries"
:
queries
,
"documents"
:
documents
}
score_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
score_url
)
print
(
"
\n
Prompt when queries is string and documents is a list:"
)
pprint
.
pprint
(
prompt
)
print
(
"
\n
Score Response:"
)
pprint
.
pprint
(
score_response
.
json
())
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
args
=
parse_args
()
args
=
parse_args
()
...
...
tests/entrypoints/pooling/classify/test_offline.py
View file @
5eb36575
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
logging
import
weakref
import
weakref
import
pytest
import
pytest
import
torch
import
torch
from
tests.models.utils
import
softmax
from
tests.models.utils
import
softmax
from
vllm
import
LLM
,
ClassificationRequestOutput
,
PoolingParams
from
vllm
import
LLM
,
ClassificationRequestOutput
,
PoolingParams
,
PoolingRequestOutput
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.tasks
import
PoolingTask
from
vllm.tasks
import
PoolingTask
...
@@ -65,6 +66,18 @@ def test_list_prompts(llm: LLM):
...
@@ -65,6 +66,18 @@ def test_list_prompts(llm: LLM):
assert
len
(
outputs
[
i
].
outputs
.
probs
)
==
num_labels
assert
len
(
outputs
[
i
].
outputs
.
probs
)
==
num_labels
@
pytest
.
mark
.
skip_global_cleanup
def
test_token_classify
(
llm
:
LLM
,
caplog_vllm
):
with
caplog_vllm
.
at_level
(
level
=
logging
.
WARNING
,
logger
=
"vllm"
):
outputs
=
llm
.
encode
(
prompt
,
pooling_task
=
"token_classify"
,
use_tqdm
=
False
)
assert
"deprecated"
in
caplog_vllm
.
text
assert
len
(
outputs
)
==
1
assert
isinstance
(
outputs
[
0
],
PoolingRequestOutput
)
assert
outputs
[
0
].
prompt_token_ids
==
prompt_token_ids
assert
outputs
[
0
].
outputs
.
data
.
shape
==
(
len
(
prompt_token_ids
),
num_labels
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_pooling_params
(
llm
:
LLM
):
def
test_pooling_params
(
llm
:
LLM
):
def
get_outputs
(
use_activation
):
def
get_outputs
(
use_activation
):
...
@@ -97,12 +110,10 @@ def test_score_api(llm: LLM):
...
@@ -97,12 +110,10 @@ def test_score_api(llm: LLM):
llm
.
score
(
"ping"
,
"pong"
,
use_tqdm
=
False
)
llm
.
score
(
"ping"
,
"pong"
,
use_tqdm
=
False
)
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"embed"
,
"token_embed"
,
"token_classify"
,
"plugin"
])
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"embed"
,
"token_embed"
,
"plugin"
])
def
test_unsupported_tasks
(
llm
:
LLM
,
task
:
PoolingTask
):
def
test_unsupported_tasks
(
llm
:
LLM
,
task
:
PoolingTask
):
if
task
==
"plugin"
:
if
task
==
"plugin"
:
err_msg
=
"No IOProcessor plugin installed."
err_msg
=
"No IOProcessor plugin installed."
elif
task
==
"token_classify"
:
err_msg
=
"Try switching the model's pooling_task via.+"
else
:
else
:
err_msg
=
"Embedding API is not supported by this model.+"
err_msg
=
"Embedding API is not supported by this model.+"
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
...
...
tests/entrypoints/pooling/classify/test_online.py
View file @
5eb36575
...
@@ -436,7 +436,26 @@ async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
...
@@ -436,7 +436,26 @@ async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"embed"
,
"token_embed"
,
"token_classify"
,
"plugin"
])
async
def
test_pooling_token_classify
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
task
=
"token_classify"
response
=
requests
.
post
(
server
.
url_for
(
"pooling"
),
json
=
{
"model"
:
model_name
,
"input"
:
input_text
,
"encoding_format"
:
"float"
,
"task"
:
task
,
},
)
poolings
=
PoolingResponse
.
model_validate
(
response
.
json
())
assert
len
(
poolings
.
data
)
==
1
assert
len
(
poolings
.
data
[
0
].
data
)
==
8
assert
len
(
poolings
.
data
[
0
].
data
[
0
])
==
2
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"embed"
,
"token_embed"
,
"plugin"
])
async
def
test_pooling_not_supported
(
async
def
test_pooling_not_supported
(
server
:
RemoteOpenAIServer
,
model_name
:
str
,
task
:
str
server
:
RemoteOpenAIServer
,
model_name
:
str
,
task
:
str
):
):
...
@@ -450,11 +469,8 @@ async def test_pooling_not_supported(
...
@@ -450,11 +469,8 @@ async def test_pooling_not_supported(
},
},
)
)
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
if
task
==
"plugin"
:
if
task
==
"plugin"
:
err_msg
=
"No IOProcessor plugin installed."
err_msg
=
"No IOProcessor plugin installed."
elif
task
==
"token_classify"
:
err_msg
=
"Try switching the model's pooling_task via"
else
:
else
:
err_msg
=
f
"Unsupported task:
{
task
!
r
}
"
err_msg
=
f
"Unsupported task:
{
task
!
r
}
"
assert
response
.
json
()[
"error"
][
"message"
].
startswith
(
err_msg
)
assert
response
.
json
()[
"error"
][
"message"
].
startswith
(
err_msg
)
tests/entrypoints/pooling/embed/test_offline.py
View file @
5eb36575
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
logging
import
weakref
import
weakref
import
pytest
import
pytest
...
@@ -37,11 +38,11 @@ def llm():
...
@@ -37,11 +38,11 @@ def llm():
seed
=
0
,
seed
=
0
,
attention_config
=
attention_config
,
attention_config
=
attention_config
,
)
)
assert
embedding_size
==
llm
.
model_config
.
embedding_size
yield
weakref
.
proxy
(
llm
)
yield
weakref
.
proxy
(
llm
)
del
llm
del
llm
cleanup_dist_env_and_memory
()
cleanup_dist_env_and_memory
()
...
@@ -73,6 +74,16 @@ def test_list_prompts(llm: LLM):
...
@@ -73,6 +74,16 @@ def test_list_prompts(llm: LLM):
assert
len
(
outputs
[
i
].
outputs
.
embedding
)
==
embedding_size
assert
len
(
outputs
[
i
].
outputs
.
embedding
)
==
embedding_size
@
pytest
.
mark
.
skip_global_cleanup
def
test_token_embed
(
llm
:
LLM
,
caplog_vllm
):
with
caplog_vllm
.
at_level
(
level
=
logging
.
WARNING
,
logger
=
"vllm"
):
outputs
=
llm
.
encode
(
prompt
,
pooling_task
=
"token_embed"
,
use_tqdm
=
False
)
assert
"deprecated"
in
caplog_vllm
.
text
multi_vector
=
outputs
[
0
].
outputs
.
data
assert
multi_vector
.
shape
==
(
11
,
384
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_pooling_params
(
llm
:
LLM
):
def
test_pooling_params
(
llm
:
LLM
):
def
get_outputs
(
normalize
):
def
get_outputs
(
normalize
):
...
@@ -96,14 +107,10 @@ def test_pooling_params(llm: LLM):
...
@@ -96,14 +107,10 @@ def test_pooling_params(llm: LLM):
)
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"token_classify"
,
"classify"
,
"plugin"
])
"task"
,
[
"token_classify"
,
"classify"
,
"token_embed"
,
"plugin"
]
)
def
test_unsupported_tasks
(
llm
:
LLM
,
task
:
PoolingTask
):
def
test_unsupported_tasks
(
llm
:
LLM
,
task
:
PoolingTask
):
if
task
==
"plugin"
:
if
task
==
"plugin"
:
err_msg
=
"No IOProcessor plugin installed."
err_msg
=
"No IOProcessor plugin installed."
elif
task
==
"token_embed"
:
err_msg
=
"Try switching the model's pooling_task via.+"
else
:
else
:
err_msg
=
"Classification API is not supported by this model.+"
err_msg
=
"Classification API is not supported by this model.+"
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
...
...
tests/entrypoints/pooling/embed/test_online.py
View file @
5eb36575
...
@@ -732,9 +732,28 @@ async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
...
@@ -732,9 +732,28 @@ async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
async
def
test_pooling_token_embed
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
"task"
,
[
"classify"
,
"token_classify"
,
"token_embed"
,
"plugin"
]
task
=
"token_embed"
)
response
=
requests
.
post
(
server
.
url_for
(
"pooling"
),
json
=
{
"model"
:
model_name
,
"input"
:
input_text
,
"encoding_format"
:
"float"
,
"task"
:
task
,
},
)
poolings
=
PoolingResponse
.
model_validate
(
response
.
json
())
assert
len
(
poolings
.
data
)
==
1
assert
len
(
poolings
.
data
[
0
].
data
)
==
len
(
input_tokens
)
assert
len
(
poolings
.
data
[
0
].
data
[
0
])
==
384
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"classify"
,
"token_classify"
,
"plugin"
])
async
def
test_pooling_not_supported
(
async
def
test_pooling_not_supported
(
server
:
RemoteOpenAIServer
,
model_name
:
str
,
task
:
str
server
:
RemoteOpenAIServer
,
model_name
:
str
,
task
:
str
):
):
...
@@ -750,8 +769,6 @@ async def test_pooling_not_supported(
...
@@ -750,8 +769,6 @@ async def test_pooling_not_supported(
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
if
task
==
"plugin"
:
if
task
==
"plugin"
:
err_msg
=
"No IOProcessor plugin installed."
err_msg
=
"No IOProcessor plugin installed."
elif
task
==
"token_embed"
:
err_msg
=
"Try switching the model's pooling_task via"
else
:
else
:
err_msg
=
f
"Unsupported task:
{
task
!
r
}
"
err_msg
=
f
"Unsupported task:
{
task
!
r
}
"
assert
response
.
json
()[
"error"
][
"message"
].
startswith
(
err_msg
)
assert
response
.
json
()[
"error"
][
"message"
].
startswith
(
err_msg
)
tests/entrypoints/pooling/scoring/test_cross_encoder_online.py
View file @
5eb36575
...
@@ -452,6 +452,25 @@ async def test_pooling_classify(server: RemoteOpenAIServer):
...
@@ -452,6 +452,25 @@ async def test_pooling_classify(server: RemoteOpenAIServer):
assert
len
(
poolings
.
data
[
0
].
data
)
==
1
assert
len
(
poolings
.
data
[
0
].
data
)
==
1
@
pytest
.
mark
.
asyncio
async
def
test_pooling_token_classify
(
server
:
RemoteOpenAIServer
):
response
=
requests
.
post
(
server
.
url_for
(
"pooling"
),
json
=
{
"model"
:
MODEL_NAME
,
"task"
:
"token_classify"
,
"input"
:
input_text
,
"encoding_format"
:
"float"
,
},
)
poolings
=
PoolingResponse
.
model_validate
(
response
.
json
())
assert
len
(
poolings
.
data
)
==
1
assert
len
(
poolings
.
data
[
0
].
data
)
==
len
(
input_tokens
)
assert
len
(
poolings
.
data
[
0
].
data
[
0
])
==
1
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_rerank_max_tokens_per_doc
(
async
def
test_rerank_max_tokens_per_doc
(
server
:
RemoteOpenAIServer
,
server
:
RemoteOpenAIServer
,
...
@@ -525,7 +544,7 @@ async def test_rerank_max_tokens_per_doc_validation(
...
@@ -525,7 +544,7 @@ async def test_rerank_max_tokens_per_doc_validation(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"embed"
,
"token_embed"
,
"token_classify"
,
"plugin"
])
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"embed"
,
"token_embed"
,
"plugin"
])
async
def
test_pooling_not_supported
(
server
:
RemoteOpenAIServer
,
task
:
str
):
async
def
test_pooling_not_supported
(
server
:
RemoteOpenAIServer
,
task
:
str
):
response
=
requests
.
post
(
response
=
requests
.
post
(
server
.
url_for
(
"pooling"
),
server
.
url_for
(
"pooling"
),
...
@@ -539,8 +558,6 @@ async def test_pooling_not_supported(server: RemoteOpenAIServer, task: str):
...
@@ -539,8 +558,6 @@ async def test_pooling_not_supported(server: RemoteOpenAIServer, task: str):
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
if
task
==
"plugin"
:
if
task
==
"plugin"
:
err_msg
=
"No IOProcessor plugin installed."
err_msg
=
"No IOProcessor plugin installed."
elif
task
==
"token_classify"
:
err_msg
=
"Try switching the model's pooling_task via"
else
:
else
:
err_msg
=
f
"Unsupported task:
{
task
!
r
}
"
err_msg
=
f
"Unsupported task:
{
task
!
r
}
"
assert
response
.
json
()[
"error"
][
"message"
].
startswith
(
err_msg
)
assert
response
.
json
()[
"error"
][
"message"
].
startswith
(
err_msg
)
tests/entrypoints/pooling/token_classify/test_offline.py
View file @
5eb36575
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
logging
import
weakref
import
weakref
import
pytest
import
pytest
...
@@ -59,19 +60,22 @@ def test_token_ids_prompts(llm: LLM):
...
@@ -59,19 +60,22 @@ def test_token_ids_prompts(llm: LLM):
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_score_api
(
llm
:
LLM
):
def
test_score_api
(
llm
:
LLM
):
err_msg
=
"
This model does not support the Scoring API
."
err_msg
=
"
Scoring API is only enabled for num_labels == 1
."
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
llm
.
score
(
"ping"
,
"pong"
,
use_tqdm
=
False
)
llm
.
score
(
"ping"
,
"pong"
,
use_tqdm
=
False
)
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"classify"
,
"embed"
,
"token_embed"
,
"plugin"
])
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"classify"
,
"embed"
,
"token_embed"
,
"plugin"
])
def
test_unsupported_tasks
(
llm
:
LLM
,
task
:
PoolingTask
,
caplog_vllm
):
def
test_unsupported_tasks
(
llm
:
LLM
,
task
:
PoolingTask
,
caplog_vllm
):
if
task
==
"
plugin
"
:
if
task
==
"
classify
"
:
err_msg
=
"No IOProcessor plugin installed."
with
caplog_vllm
.
at_level
(
level
=
logging
.
WARNING
,
logger
=
"vllm"
):
elif
task
==
"classify"
:
llm
.
encode
(
prompt
,
pooling_task
=
task
,
use_tqdm
=
False
)
err_msg
=
"Try switching the model's pooling_task via.+"
assert
"deprecated"
in
caplog_vllm
.
text
else
:
else
:
err_msg
=
"Embedding API is not supported by this model.+"
if
task
==
"plugin"
:
err_msg
=
"No IOProcessor plugin installed."
else
:
err_msg
=
"Embedding API is not supported by this model.+"
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
llm
.
encode
(
prompt
,
pooling_task
=
task
,
use_tqdm
=
False
)
llm
.
encode
(
prompt
,
pooling_task
=
task
,
use_tqdm
=
False
)
tests/entrypoints/pooling/token_classify/test_online.py
View file @
5eb36575
...
@@ -50,7 +50,7 @@ async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: st
...
@@ -50,7 +50,7 @@ async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: st
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"classify"
,
"embed"
,
"token_embed"
,
"plugin"
])
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"embed"
,
"token_embed"
,
"plugin"
])
async
def
test_pooling_not_supported
(
async
def
test_pooling_not_supported
(
server
:
RemoteOpenAIServer
,
model_name
:
str
,
task
:
str
server
:
RemoteOpenAIServer
,
model_name
:
str
,
task
:
str
):
):
...
@@ -63,12 +63,9 @@ async def test_pooling_not_supported(
...
@@ -63,12 +63,9 @@ async def test_pooling_not_supported(
"task"
:
task
,
"task"
:
task
,
},
},
)
)
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
if
task
==
"plugin"
:
if
task
==
"plugin"
:
err_msg
=
"No IOProcessor plugin installed."
err_msg
=
"No IOProcessor plugin installed."
elif
task
==
"classify"
:
err_msg
=
"Try switching the model's pooling_task via"
else
:
else
:
err_msg
=
f
"Unsupported task:
{
task
!
r
}
"
err_msg
=
f
"Unsupported task:
{
task
!
r
}
"
assert
response
.
json
()[
"error"
][
"message"
].
startswith
(
err_msg
)
assert
response
.
json
()[
"error"
][
"message"
].
startswith
(
err_msg
)
tests/entrypoints/pooling/token_embed/test_offline.py
View file @
5eb36575
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
logging
import
weakref
import
weakref
import
pytest
import
pytest
...
@@ -63,12 +64,15 @@ def test_token_ids_prompts(llm: LLM):
...
@@ -63,12 +64,15 @@ def test_token_ids_prompts(llm: LLM):
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"embed"
,
"classify"
,
"token_classify"
,
"plugin"
])
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"embed"
,
"classify"
,
"token_classify"
,
"plugin"
])
def
test_unsupported_tasks
(
llm
:
LLM
,
task
:
PoolingTask
,
caplog_vllm
):
def
test_unsupported_tasks
(
llm
:
LLM
,
task
:
PoolingTask
,
caplog_vllm
):
if
task
==
"
plugin
"
:
if
task
==
"
embed
"
:
err_msg
=
"No IOProcessor plugin installed."
with
caplog_vllm
.
at_level
(
level
=
logging
.
WARNING
,
logger
=
"vllm"
):
elif
task
==
"embed"
:
llm
.
encode
(
prompt
,
pooling_task
=
task
,
use_tqdm
=
False
)
err_msg
=
"Try switching the model's pooling_task via.+"
assert
"deprecated"
in
caplog_vllm
.
text
else
:
else
:
err_msg
=
"Classification API is not supported by this model.+"
if
task
==
"plugin"
:
err_msg
=
"No IOProcessor plugin installed."
else
:
err_msg
=
"Classification API is not supported by this model.+"
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
llm
.
encode
(
prompt
,
pooling_task
=
task
,
use_tqdm
=
False
)
llm
.
encode
(
prompt
,
pooling_task
=
task
,
use_tqdm
=
False
)
tests/entrypoints/pooling/token_embed/test_online.py
View file @
5eb36575
...
@@ -73,7 +73,7 @@ async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
...
@@ -73,7 +73,7 @@ async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"embed"
,
"classify"
,
"token_classify"
,
"plugin"
])
@
pytest
.
mark
.
parametrize
(
"task"
,
[
"classify"
,
"token_classify"
,
"plugin"
])
async
def
test_pooling_not_supported
(
async
def
test_pooling_not_supported
(
server
:
RemoteOpenAIServer
,
model_name
:
str
,
task
:
str
server
:
RemoteOpenAIServer
,
model_name
:
str
,
task
:
str
):
):
...
@@ -86,12 +86,9 @@ async def test_pooling_not_supported(
...
@@ -86,12 +86,9 @@ async def test_pooling_not_supported(
"task"
:
task
,
"task"
:
task
,
},
},
)
)
assert
response
.
json
()[
"error"
][
"type"
]
==
"BadRequestError"
if
task
==
"plugin"
:
if
task
==
"plugin"
:
err_msg
=
"No IOProcessor plugin installed."
err_msg
=
"No IOProcessor plugin installed."
elif
task
==
"embed"
:
err_msg
=
"Try switching the model's pooling_task via"
else
:
else
:
err_msg
=
f
"Unsupported task:
{
task
!
r
}
"
err_msg
=
f
"Unsupported task:
{
task
!
r
}
"
assert
response
.
json
()[
"error"
][
"message"
].
startswith
(
err_msg
)
assert
response
.
json
()[
"error"
][
"message"
].
startswith
(
err_msg
)
tests/models/language/pooling/test_all_pooling_plus_chunked_prefill.py
View file @
5eb36575
...
@@ -6,7 +6,6 @@ from transformers import AutoModel
...
@@ -6,7 +6,6 @@ from transformers import AutoModel
from
tests.models.utils
import
check_embeddings_close
from
tests.models.utils
import
check_embeddings_close
from
vllm
import
TokensPrompt
from
vllm
import
TokensPrompt
from
vllm.config
import
PoolerConfig
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -22,7 +21,6 @@ def test_embed_models(hf_runner, vllm_runner, model: str):
...
@@ -22,7 +21,6 @@ def test_embed_models(hf_runner, vllm_runner, model: str):
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
runner
=
"pooling"
,
runner
=
"pooling"
,
pooler_config
=
PoolerConfig
(
task
=
"token_embed"
),
max_model_len
=
128
,
max_model_len
=
128
,
max_num_batched_tokens
=
chunk_size
,
max_num_batched_tokens
=
chunk_size
,
enforce_eager
=
True
,
enforce_eager
=
True
,
...
...
tests/models/language/pooling/test_bge_m3.py
View file @
5eb36575
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
import
httpx
import
httpx
import
openai
import
openai
import
pytest
import
pytest
import
pytest_asyncio
import
torch
import
torch
from
....utils
import
RemoteOpenAIServer
from
....utils
import
RemoteOpenAIServer
...
@@ -24,42 +25,29 @@ sentences_2 = [
...
@@ -24,42 +25,29 @@ sentences_2 = [
similarity_reference
=
[[
0.6259
,
0.3474
],
[
0.3309
,
0.6734
]]
similarity_reference
=
[[
0.6259
,
0.3474
],
[
0.3309
,
0.6734
]]
lexical_score_reference
=
[
0.19554901123046875
,
0.0
]
lexical_score_reference
=
[
0.19554901123046875
,
0.0
]
colbert_score_reference
=
[
0.7797
,
0.4620
]
colbert_score_reference
=
[
0.7797
,
0.4620
]
SUPPORTED_TASKS
=
[
"embed"
,
"token_embed"
,
"token_classify"
]
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
SUPPORTED_TASKS
)
def
pooling_task
(
request
):
yield
request
.
param
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
pooling_task
):
def
server
():
args
=
[
args
=
[
"--max-model-len"
,
"--max-model-len"
,
str
(
MAX_MODEL_LEN
),
str
(
MAX_MODEL_LEN
),
"--hf-overrides"
,
"--hf-overrides"
,
'{"architectures": ["BgeM3EmbeddingModel"]}'
,
'{"architectures": ["BgeM3EmbeddingModel"]}'
,
"--pooler-config.task"
,
pooling_task
,
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
yield
remote_server
@
pytest
.
mark
.
asyncio
@
pytest_asyncio
.
fixture
async
def
test_bge_m3_api_server_embedding
(
server
,
pooling_task
):
async
def
client
(
server
):
client
=
server
.
get_async_client
()
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
if
pooling_task
!=
"embed"
:
with
pytest
.
raises
(
openai
.
InternalServerError
):
await
run_client_embeddings
(
client
,
MODEL_NAME
,
sentences_1
,
)
return
@
pytest
.
mark
.
asyncio
async
def
test_bge_m3_api_server_embedding
(
client
:
openai
.
AsyncOpenAI
):
embeddings_list_1
=
await
run_client_embeddings
(
embeddings_list_1
=
await
run_client_embeddings
(
client
,
client
,
MODEL_NAME
,
MODEL_NAME
,
...
@@ -129,14 +117,7 @@ def compute_lexical_matching_score(
...
@@ -129,14 +117,7 @@ def compute_lexical_matching_score(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_bge_m3_api_server_sparse_embedding
(
server
,
pooling_task
):
async
def
test_bge_m3_api_server_sparse_embedding
(
client
:
openai
.
AsyncOpenAI
):
client
=
server
.
get_async_client
()
if
pooling_task
!=
"token_classify"
:
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
sparse_embeddings
(
client
,
sentences_1
)
return
embeddings_1
=
await
sparse_embeddings
(
client
,
sentences_1
)
embeddings_1
=
await
sparse_embeddings
(
client
,
sentences_1
)
embeddings_2
=
await
sparse_embeddings
(
client
,
sentences_2
)
embeddings_2
=
await
sparse_embeddings
(
client
,
sentences_2
)
...
@@ -156,11 +137,9 @@ async def test_bge_m3_api_server_sparse_embedding(server, pooling_task):
...
@@ -156,11 +137,9 @@ async def test_bge_m3_api_server_sparse_embedding(server, pooling_task):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_bge_m3_api_server_sparse_embedding_corner_case
(
server
,
pooling_task
):
async
def
test_bge_m3_api_server_sparse_embedding_corner_case
(
if
pooling_task
!=
"token_classify"
:
client
:
openai
.
AsyncOpenAI
,
return
):
client
=
server
.
get_async_client
()
embeddings
=
await
sparse_embeddings
(
client
,
[
"Hi"
])
embeddings
=
await
sparse_embeddings
(
client
,
[
"Hi"
])
assert
len
(
embeddings
)
==
1
assert
len
(
embeddings
)
==
1
assert
2673
in
embeddings
[
0
]
assert
2673
in
embeddings
[
0
]
...
@@ -176,18 +155,7 @@ def colbert_score(q_reps: torch.Tensor, p_reps: torch.Tensor) -> torch.Tensor:
...
@@ -176,18 +155,7 @@ def colbert_score(q_reps: torch.Tensor, p_reps: torch.Tensor) -> torch.Tensor:
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_bge_m3_api_server_multi_vector
(
server
,
pooling_task
):
async
def
test_bge_m3_api_server_multi_vector
(
client
:
openai
.
AsyncOpenAI
):
client
=
server
.
get_async_client
()
if
pooling_task
!=
"token_embed"
:
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
client
.
post
(
"../pooling"
,
body
=
{
"model"
:
MODEL_NAME
,
"input"
:
sentences_1
,
"task"
:
"token_embed"
},
cast_to
=
httpx
.
Response
,
)
return
result_1
=
await
client
.
post
(
result_1
=
await
client
.
post
(
"../pooling"
,
"../pooling"
,
body
=
{
"model"
:
MODEL_NAME
,
"input"
:
sentences_1
,
"task"
:
"token_embed"
},
body
=
{
"model"
:
MODEL_NAME
,
"input"
:
sentences_1
,
"task"
:
"token_embed"
},
...
...
tests/models/language/pooling/test_extract_hidden_states.py
View file @
5eb36575
...
@@ -4,7 +4,6 @@ import pytest
...
@@ -4,7 +4,6 @@ import pytest
import
torch
import
torch
from
vllm
import
TokensPrompt
from
vllm
import
TokensPrompt
from
vllm.config
import
PoolerConfig
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -21,7 +20,6 @@ def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
...
@@ -21,7 +20,6 @@ def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
max_model_len
=
128
,
max_model_len
=
128
,
enforce_eager
=
True
,
enforce_eager
=
True
,
runner
=
"pooling"
,
runner
=
"pooling"
,
pooler_config
=
PoolerConfig
(
task
=
"token_embed"
),
enable_prefix_caching
=
True
,
enable_prefix_caching
=
True
,
)
as
vllm_model
:
)
as
vllm_model
:
pooling_outputs
=
vllm_model
.
llm
.
encode
(
pooling_outputs
=
vllm_model
.
llm
.
encode
(
...
@@ -46,3 +44,14 @@ def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
...
@@ -46,3 +44,14 @@ def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
assert
len
(
output
.
prompt_token_ids
)
==
n
assert
len
(
output
.
prompt_token_ids
)
==
n
assert
len
(
output
.
outputs
.
data
)
==
n
assert
len
(
output
.
outputs
.
data
)
==
n
assert
output
.
num_cached_tokens
==
0
assert
output
.
num_cached_tokens
==
0
# skip_reading_prefix_cache can still write to cache
# to accelerate following requests
pooling_outputs
=
vllm_model
.
llm
.
encode
(
[
TokensPrompt
(
prompt_token_ids
=
t
)
for
t
in
token_prompts
],
pooling_task
=
"embed"
,
)
for
n
,
output
in
zip
(
n_prompt_tokens
,
pooling_outputs
):
assert
len
(
output
.
prompt_token_ids
)
==
n
assert
output
.
num_cached_tokens
>
0
tests/models/language/pooling/test_multi_vector_retrieval.py
View file @
5eb36575
...
@@ -5,7 +5,6 @@ import torch
...
@@ -5,7 +5,6 @@ import torch
from
transformers
import
AutoModel
from
transformers
import
AutoModel
from
tests.models.utils
import
check_embeddings_close
from
tests.models.utils
import
check_embeddings_close
from
vllm.config
import
PoolerConfig
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -18,7 +17,6 @@ def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype
...
@@ -18,7 +17,6 @@ def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
runner
=
"pooling"
,
runner
=
"pooling"
,
pooler_config
=
PoolerConfig
(
task
=
"token_embed"
),
max_model_len
=
None
,
max_model_len
=
None
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
token_embed
(
example_prompts
)
vllm_outputs
=
vllm_model
.
token_embed
(
example_prompts
)
...
...
tests/models/language/pooling/test_pooler_config_init_behaviour.py
View file @
5eb36575
...
@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize(
...
@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize(
model
,
model
,
max_model_len
=
512
,
max_model_len
=
512
,
dtype
=
dtype
,
dtype
=
dtype
,
pooler_config
=
PoolerConfig
(
use_activation
=
False
,
task
=
"token_embed"
),
pooler_config
=
PoolerConfig
(
use_activation
=
False
),
)
as
vllm_model
:
)
as
vllm_model
:
wo_normalize
=
vllm_model
.
token_embed
(
example_prompts
)
wo_normalize
=
vllm_model
.
token_embed
(
example_prompts
)
...
@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize(
...
@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize(
model
,
model
,
max_model_len
=
512
,
max_model_len
=
512
,
dtype
=
dtype
,
dtype
=
dtype
,
pooler_config
=
PoolerConfig
(
use_activation
=
True
,
task
=
"token_embed"
),
pooler_config
=
PoolerConfig
(
use_activation
=
True
),
)
as
vllm_model
:
)
as
vllm_model
:
w_normalize
=
vllm_model
.
token_embed
(
example_prompts
)
w_normalize
=
vllm_model
.
token_embed
(
example_prompts
)
...
...
vllm/entrypoints/llm.py
View file @
5eb36575
...
@@ -79,7 +79,7 @@ from vllm.renderers.inputs.preprocess import (
...
@@ -79,7 +79,7 @@ from vllm.renderers.inputs.preprocess import (
prompt_to_seq
,
prompt_to_seq
,
)
)
from
vllm.sampling_params
import
BeamSearchParams
,
RequestOutputKind
,
SamplingParams
from
vllm.sampling_params
import
BeamSearchParams
,
RequestOutputKind
,
SamplingParams
from
vllm.tasks
import
SCORE_TYPE_MAP
,
PoolingTask
from
vllm.tasks
import
PoolingTask
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers
import
TokenizerLike
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils.counter
import
Counter
from
vllm.utils.counter
import
Counter
...
@@ -1204,9 +1204,12 @@ class LLM:
...
@@ -1204,9 +1204,12 @@ class LLM:
f
"Supported tasks:
{
self
.
supported_tasks
}
"
f
"Supported tasks:
{
self
.
supported_tasks
}
"
)
)
else
:
else
:
raise
ValueError
(
logger
.
warning_once
(
f
"Try switching the model's pooling_task "
"Pooling multitask support is deprecated and will "
f
'via `PoolerConfig(task="
{
pooling_task
}
")`'
"be removed in v0.20. When the default pooling task is "
"not what you want, you need to manually specify it "
'via PoolerConfig(task="%s"). '
,
pooling_task
,
)
)
if
pooling_task
==
"plugin"
and
"plugin"
not
in
self
.
pooling_io_processors
:
if
pooling_task
==
"plugin"
and
"plugin"
not
in
self
.
pooling_io_processors
:
...
@@ -1409,7 +1412,7 @@ class LLM:
...
@@ -1409,7 +1412,7 @@ class LLM:
"pooling model."
"pooling model."
)
)
score_type
:
str
|
None
=
SCORE_TYPE_MAP
.
get
(
self
.
pooling_task
,
None
)
# type: ignore[arg-
type
]
score_type
=
self
.
model_config
.
score_
type
if
(
if
(
score_type
==
"cross-encoder"
score_type
==
"cross-encoder"
and
getattr
(
self
.
model_config
.
hf_config
,
"num_labels"
,
0
)
!=
1
and
getattr
(
self
.
model_config
.
hf_config
,
"num_labels"
,
0
)
!=
1
...
...
vllm/entrypoints/pooling/base/serving.py
View file @
5eb36575
...
@@ -15,7 +15,10 @@ from starlette.datastructures import Headers
...
@@ -15,7 +15,10 @@ from starlette.datastructures import Headers
from
vllm
import
PoolingParams
,
PoolingRequestOutput
,
envs
from
vllm
import
PoolingParams
,
PoolingRequestOutput
,
envs
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.chat_utils
import
ChatTemplateConfig
from
vllm.entrypoints.chat_utils
import
(
ChatTemplateConfig
,
ChatTemplateContentFormatOption
,
)
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
...
@@ -45,7 +48,9 @@ class PoolingServingBase(ABC):
...
@@ -45,7 +48,9 @@ class PoolingServingBase(ABC):
models
:
OpenAIServingModels
,
models
:
OpenAIServingModels
,
*
,
*
,
request_logger
:
RequestLogger
|
None
,
request_logger
:
RequestLogger
|
None
,
chat_template_config
:
ChatTemplateConfig
,
chat_template
:
str
|
None
=
None
,
chat_template_content_format
:
ChatTemplateContentFormatOption
=
"auto"
,
trust_request_chat_template
:
bool
=
False
,
return_tokens_as_token_ids
:
bool
=
False
,
return_tokens_as_token_ids
:
bool
=
False
,
log_error_stack
:
bool
=
False
,
log_error_stack
:
bool
=
False
,
):
):
...
@@ -58,7 +63,11 @@ class PoolingServingBase(ABC):
...
@@ -58,7 +63,11 @@ class PoolingServingBase(ABC):
self
.
request_logger
=
request_logger
self
.
request_logger
=
request_logger
self
.
return_tokens_as_token_ids
=
return_tokens_as_token_ids
self
.
return_tokens_as_token_ids
=
return_tokens_as_token_ids
self
.
log_error_stack
=
log_error_stack
self
.
log_error_stack
=
log_error_stack
self
.
chat_template_config
=
chat_template_config
self
.
chat_template_config
=
ChatTemplateConfig
(
chat_template
=
chat_template
,
chat_template_content_format
=
chat_template_content_format
,
trust_request_chat_template
=
trust_request_chat_template
,
)
# Shared thread pool executor for preprocessing and postprocessing.
# Shared thread pool executor for preprocessing and postprocessing.
self
.
_executor
:
Executor
=
models
.
renderer
.
_executor
self
.
_executor
:
Executor
=
models
.
renderer
.
_executor
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment