Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d9d21eb8
Unverified
Commit
d9d21eb8
authored
Mar 31, 2026
by
wang.yuqi
Committed by
GitHub
Mar 31, 2026
Browse files
[Frontend][3/n] Improve pooling entrypoints | scoring. (#28631)
Signed-off-by:
wang.yuqi
<
yuqi.wang@daocloud.io
>
parent
f09daea2
Changes
37
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
403 additions
and
626 deletions
+403
-626
tests/entrypoints/openai/utils.py
tests/entrypoints/openai/utils.py
+1
-3
tests/entrypoints/pooling/classify/test_offline.py
tests/entrypoints/pooling/classify/test_offline.py
+1
-1
tests/entrypoints/pooling/classify/test_online.py
tests/entrypoints/pooling/classify/test_online.py
+2
-2
tests/entrypoints/pooling/scoring/test_bi_encoder_online.py
tests/entrypoints/pooling/scoring/test_bi_encoder_online.py
+1
-1
tests/entrypoints/pooling/scoring/test_cross_encoder_online.py
.../entrypoints/pooling/scoring/test_cross_encoder_online.py
+1
-1
tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py
...oints/pooling/scoring/test_cross_encoder_online_vision.py
+1
-1
tests/entrypoints/pooling/scoring/test_late_interaction_offline_vision.py
...s/pooling/scoring/test_late_interaction_offline_vision.py
+93
-0
tests/entrypoints/pooling/scoring/test_late_interaction_online.py
...trypoints/pooling/scoring/test_late_interaction_online.py
+1
-1
tests/entrypoints/pooling/scoring/test_late_interaction_online_vision.py
...ts/pooling/scoring/test_late_interaction_online_vision.py
+193
-0
tests/entrypoints/pooling/scoring/test_utils.py
tests/entrypoints/pooling/scoring/test_utils.py
+0
-353
tests/entrypoints/pooling/scoring/util.py
tests/entrypoints/pooling/scoring/util.py
+39
-1
tests/entrypoints/pooling/token_classify/test_offline.py
tests/entrypoints/pooling/token_classify/test_offline.py
+1
-1
tests/models/language/pooling/test_colbert.py
tests/models/language/pooling/test_colbert.py
+1
-1
tests/models/multimodal/pooling/test_colmodernvbert.py
tests/models/multimodal/pooling/test_colmodernvbert.py
+1
-1
tests/models/multimodal/pooling/test_colpali.py
tests/models/multimodal/pooling/test_colpali.py
+2
-2
tests/models/multimodal/pooling/test_colqwen3.py
tests/models/multimodal/pooling/test_colqwen3.py
+2
-2
tests/models/multimodal/pooling/test_colqwen3_5.py
tests/models/multimodal/pooling/test_colqwen3_5.py
+1
-1
tests/models/multimodal/pooling/test_jinavl_reranker.py
tests/models/multimodal/pooling/test_jinavl_reranker.py
+1
-1
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
+1
-1
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+60
-252
No files found.
tests/entrypoints/openai/utils.py
View file @
d9d21eb8
...
@@ -10,9 +10,7 @@ from vllm.entrypoints.openai.chat_completion.protocol import (
...
@@ -10,9 +10,7 @@ from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionStreamResponse
,
ChatCompletionStreamResponse
,
ChatMessage
,
ChatMessage
,
)
)
from
vllm.entrypoints.openai.engine.protocol
import
(
from
vllm.entrypoints.openai.engine.protocol
import
UsageInfo
UsageInfo
,
)
async
def
accumulate_streaming_response
(
async
def
accumulate_streaming_response
(
...
...
tests/entrypoints/pooling/classify/test_offline.py
View file @
d9d21eb8
...
@@ -105,7 +105,7 @@ def test_pooling_params(llm: LLM):
...
@@ -105,7 +105,7 @@ def test_pooling_params(llm: LLM):
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_score_api
(
llm
:
LLM
):
def
test_score_api
(
llm
:
LLM
):
err_msg
=
"Scor
e
API is only enabled for num_labels == 1."
err_msg
=
"Scor
ing
API is only enabled for num_labels == 1."
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
llm
.
score
(
"ping"
,
"pong"
,
use_tqdm
=
False
)
llm
.
score
(
"ping"
,
"pong"
,
use_tqdm
=
False
)
...
...
tests/entrypoints/pooling/classify/test_online.py
View file @
d9d21eb8
...
@@ -390,7 +390,7 @@ async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
...
@@ -390,7 +390,7 @@ async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_score
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
async
def
test_score
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
#
s
cor
e api
is only enabled for num_labels == 1.
#
S
cor
ing API
is only enabled for num_labels == 1.
response
=
requests
.
post
(
response
=
requests
.
post
(
server
.
url_for
(
"score"
),
server
.
url_for
(
"score"
),
json
=
{
json
=
{
...
@@ -405,7 +405,7 @@ async def test_score(server: RemoteOpenAIServer, model_name: str):
...
@@ -405,7 +405,7 @@ async def test_score(server: RemoteOpenAIServer, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_rerank
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
async
def
test_rerank
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
#
rerank api
is only enabled for num_labels == 1.
#
Scoring API
is only enabled for num_labels == 1.
response
=
requests
.
post
(
response
=
requests
.
post
(
server
.
url_for
(
"rerank"
),
server
.
url_for
(
"rerank"
),
json
=
{
json
=
{
...
...
tests/entrypoints/pooling/scoring/test_bi_encoder_online.py
View file @
d9d21eb8
...
@@ -7,7 +7,7 @@ import requests
...
@@ -7,7 +7,7 @@ import requests
from
tests.entrypoints.pooling.scoring.util
import
EncoderScoringHfRunner
from
tests.entrypoints.pooling.scoring.util
import
EncoderScoringHfRunner
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm.entrypoints.pooling.pooling.protocol
import
PoolingResponse
from
vllm.entrypoints.pooling.pooling.protocol
import
PoolingResponse
from
vllm.entrypoints.pooling.scor
e
.protocol
import
RerankResponse
,
ScoreResponse
from
vllm.entrypoints.pooling.scor
ing
.protocol
import
RerankResponse
,
ScoreResponse
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
MODEL_NAME
=
"BAAI/bge-base-en-v1.5"
MODEL_NAME
=
"BAAI/bge-base-en-v1.5"
...
...
tests/entrypoints/pooling/scoring/test_cross_encoder_online.py
View file @
d9d21eb8
...
@@ -8,7 +8,7 @@ import torch.nn.functional as F
...
@@ -8,7 +8,7 @@ import torch.nn.functional as F
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm.entrypoints.pooling.pooling.protocol
import
PoolingResponse
from
vllm.entrypoints.pooling.pooling.protocol
import
PoolingResponse
from
vllm.entrypoints.pooling.scor
e
.protocol
import
RerankResponse
,
ScoreResponse
from
vllm.entrypoints.pooling.scor
ing
.protocol
import
RerankResponse
,
ScoreResponse
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
MODEL_NAME
=
"BAAI/bge-reranker-base"
MODEL_NAME
=
"BAAI/bge-reranker-base"
...
...
tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py
View file @
d9d21eb8
...
@@ -7,7 +7,7 @@ import pytest
...
@@ -7,7 +7,7 @@ import pytest
import
requests
import
requests
from
tests.utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
tests.utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
vllm.entrypoints.pooling.scor
e
.protocol
import
RerankResponse
,
ScoreResponse
from
vllm.entrypoints.pooling.scor
ing
.protocol
import
RerankResponse
,
ScoreResponse
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
from
vllm.multimodal.utils
import
encode_image_url
,
fetch_image
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
...
tests/entrypoints/pooling/scoring/test_late_interaction_offline_vision.py
0 → 100644
View file @
d9d21eb8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
weakref
import
pytest
from
vllm
import
LLM
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.platforms
import
current_platform
from
.util
import
make_base64_image
,
make_image_mm_param
MODEL_NAME
=
"vidore/colpali-v1.3-hf"
@pytest.fixture(scope="module")
def llm():
    """Yield a module-scoped proxy to an ``LLM`` built for ``MODEL_NAME``.

    After the module's tests finish, the engine reference is dropped and
    ``cleanup_dist_env_and_memory()`` is called.
    """
    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
    # that supports encoder-only models on ROCm.
    attention_config = (
        {"backend": "FLEX_ATTENTION"} if current_platform.is_rocm() else None
    )

    engine = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
        attention_config=attention_config,
    )

    # pytest caches the fixture value, so hand out a weakref.proxy to
    # enable garbage collection of the engine.
    yield weakref.proxy(engine)

    del engine

    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_query_text_vs_docs_image(llm):
    """Score a text query against image documents via the multimodal path."""
    # Red first, blue second: the assertion below relies on this order.
    image_uris = [
        make_base64_image(64, 64, color=rgb) for rgb in ((255, 0, 0), (0, 0, 255))
    ]
    image_docs = [make_image_mm_param(uri) for uri in image_uris]

    query = "Describe the red object"
    scores = llm.score(query, image_docs)

    assert len(scores) == 2
    red_output, blue_output = scores[0], scores[1]
    assert red_output.outputs.score > blue_output.outputs.score
@pytest.mark.skip_global_cleanup
def test_query_text_vs_docs_mix(llm) -> None:
    """Score a text query against a mix of text and image documents."""
    image_uri = make_base64_image(64, 64, color=(255, 0, 0))

    query = "What is the capital of France?"
    # Index 0 is the text document, index 1 is the image document.
    documents: list = ["The capital of France is Paris."]
    documents.append(make_image_mm_param(image_uri))

    scores = llm.score(query, documents)

    assert len(scores) == 2
    text_output, image_output = scores[0], scores[1]
    assert text_output.outputs.score > image_output.outputs.score
@pytest.mark.skip_global_cleanup
def test_query_image_vs_docs_text(llm) -> None:
    """Score an image query against text documents."""
    image_uri = make_base64_image(64, 64, color=(255, 0, 0))
    image_query = make_image_mm_param(image_uri, text="red color")

    first_doc = "Describe the red object."
    second_doc = "The capital of France is Paris."
    documents = [first_doc, second_doc]

    scores = llm.score(image_query, documents)

    assert len(scores) == 2
    first_output, second_output = scores[0], scores[1]
    assert first_output.outputs.score > second_output.outputs.score
tests/entrypoints/pooling/scoring/test_late_interaction_online.py
View file @
d9d21eb8
...
@@ -6,7 +6,7 @@ import pytest
...
@@ -6,7 +6,7 @@ import pytest
import
requests
import
requests
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm.entrypoints.pooling.scor
e
.protocol
import
RerankResponse
,
ScoreResponse
from
vllm.entrypoints.pooling.scor
ing
.protocol
import
RerankResponse
,
ScoreResponse
from
.util
import
ColBERTScoringHfRunner
from
.util
import
ColBERTScoringHfRunner
...
...
tests/entrypoints/pooling/scoring/test_late_interaction_online_vision.py
0 → 100644
View file @
d9d21eb8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
requests
from
tests.entrypoints.pooling.scoring.util
import
(
make_base64_image
,
make_image_mm_param
,
)
from
tests.utils
import
RemoteOpenAIServer
from
vllm.entrypoints.pooling.scoring.protocol
import
RerankResponse
,
ScoreResponse
MODEL_NAME
=
"vidore/colpali-v1.3-hf"
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME`` shared by the module."""
    extra_cli_args: list = []
    with RemoteOpenAIServer(MODEL_NAME, extra_cli_args) as running_server:
        yield running_server
@pytest.mark.asyncio
async def test_score_api_query_text_vs_docs_image(server: RemoteOpenAIServer):
    """POST a text query and two image documents to the ``score`` route."""
    query = "Describe the red object"
    # Red first, blue second: the final assertion relies on this order.
    image_uris = [
        make_base64_image(64, 64, color=rgb) for rgb in ((255, 0, 0), (0, 0, 255))
    ]
    documents = [make_image_mm_param(uri) for uri in image_uris]

    payload = {
        "model": MODEL_NAME,
        "queries": query,
        "documents": documents,
    }
    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    scores = ScoreResponse.model_validate(score_response.json())

    assert scores.id is not None
    assert scores.data is not None
    assert len(scores.data) == 2
    red_item, blue_item = scores.data[0], scores.data[1]
    assert red_item.score > blue_item.score
@pytest.mark.asyncio
async def test_score_api_query_text_vs_docs_mix(server: RemoteOpenAIServer):
    """POST a text query with one text and one image document to ``score``."""
    image_uri = make_base64_image(64, 64, color=(255, 0, 0))

    query = "What is the capital of France?"
    # Index 0 is the text document, index 1 is the image document.
    documents: list = ["The capital of France is Paris."]
    documents.append(make_image_mm_param(image_uri))

    payload = {
        "model": MODEL_NAME,
        "queries": query,
        "documents": documents,
    }
    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    scores = ScoreResponse.model_validate(score_response.json())

    assert scores.id is not None
    assert scores.data is not None
    assert len(scores.data) == 2
    text_item, image_item = scores.data[0], scores.data[1]
    assert text_item.score > image_item.score
@pytest.mark.asyncio
async def test_score_api_query_image_vs_docs_text(server: RemoteOpenAIServer):
    """POST an image query and two text documents to the ``score`` route."""
    image_uri = make_base64_image(64, 64, color=(255, 0, 0))
    image_query = make_image_mm_param(image_uri, text="red color")

    first_doc = "Describe the red object."
    second_doc = "The capital of France is Paris."
    documents = [first_doc, second_doc]

    payload = {
        "model": MODEL_NAME,
        "queries": image_query,
        "documents": documents,
    }
    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    scores = ScoreResponse.model_validate(score_response.json())

    assert scores.id is not None
    assert scores.data is not None
    assert len(scores.data) == 2
    first_item, second_item = scores.data[0], scores.data[1]
    assert first_item.score > second_item.score
@pytest.mark.asyncio
async def test_rerank_api_query_text_vs_docs_image(server: RemoteOpenAIServer):
    """POST a text query and two image documents to the ``rerank`` route."""
    query = "Describe the red object"
    # Red is document index 0, blue is document index 1.
    image_uris = [
        make_base64_image(64, 64, color=rgb) for rgb in ((255, 0, 0), (0, 0, 255))
    ]
    documents = [make_image_mm_param(uri) for uri in image_uris]

    payload = {"model": MODEL_NAME, "query": query, "documents": documents}
    rerank_response = requests.post(server.url_for("rerank"), json=payload)
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.results is not None
    assert len(rerank.results) == 2

    # Results are looked up by their ``index`` field, not by list position.
    def result_for(doc_index):
        return next(r for r in rerank.results if r.index == doc_index)

    red_result = result_for(0)
    blue_result = result_for(1)
    assert red_result.relevance_score > blue_result.relevance_score
@pytest.mark.asyncio
async def test_rerank_api_query_text_vs_docs_mix(server: RemoteOpenAIServer):
    """POST a text query with one text and one image document to ``rerank``."""
    image_uri = make_base64_image(64, 64, color=(255, 0, 0))

    query = "What is the capital of France?"
    # Index 0 is the text document, index 1 is the image document.
    documents: list = ["The capital of France is Paris."]
    documents.append(make_image_mm_param(image_uri))

    payload = {
        "model": MODEL_NAME,
        "query": query,
        "documents": documents,
    }
    rerank_response = requests.post(server.url_for("rerank"), json=payload)
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.results is not None
    assert len(rerank.results) == 2

    # Results are looked up by their ``index`` field, not by list position.
    def result_for(doc_index):
        return next(r for r in rerank.results if r.index == doc_index)

    result0 = result_for(0)
    result1 = result_for(1)
    assert result0.relevance_score > result1.relevance_score
@pytest.mark.asyncio
async def test_rerank_api_query_image_vs_docs_text(server: RemoteOpenAIServer):
    """POST an image query and two text documents to the ``rerank`` route."""
    image_uri = make_base64_image(64, 64, color=(255, 0, 0))
    image_query = make_image_mm_param(image_uri, text="red color")

    first_doc = "Describe the red object."
    second_doc = "The capital of France is Paris."
    documents = [first_doc, second_doc]

    payload = {
        "model": MODEL_NAME,
        "query": image_query,
        "documents": documents,
    }
    rerank_response = requests.post(server.url_for("rerank"), json=payload)
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.results is not None
    assert len(rerank.results) == 2

    # Results are looked up by their ``index`` field, not by list position.
    def result_for(doc_index):
        return next(r for r in rerank.results if r.index == doc_index)

    result0 = result_for(0)
    result1 = result_for(1)
    assert result0.relevance_score > result1.relevance_score
tests/entrypoints/pooling/scoring/test_utils.py
deleted
100644 → 0
View file @
f09daea2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
unittest.mock
import
patch
import
pytest
from
vllm.config
import
ModelConfig
from
vllm.entrypoints.chat_utils
import
ChatTemplateResolutionError
from
vllm.entrypoints.pooling.score.utils
import
(
get_score_prompt
,
)
from
vllm.inputs
import
TokensPrompt
from
vllm.tokenizers
import
get_tokenizer
# A cross-encoder model for testing
CROSS_ENCODER_MODEL_ID
=
"cross-encoder/ms-marco-MiniLM-L-6-v2"
def assert_prompt_tokenization_consistent(
    tokenizer, full_prompt, engine_prompt, add_special_tokens=True
):
    """Verify that engine_prompt token_ids match tokenizing full_prompt.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(full_prompt, add_special_tokens=...)``; its result is
            indexed with ``"input_ids"``.
        full_prompt: Prompt text to tokenize.
        engine_prompt: Mapping indexed with ``"prompt_token_ids"``.
        add_special_tokens: Forwarded to the tokenizer call.

    Raises:
        AssertionError: If the two token id sequences differ.
    """
    encoding = tokenizer(full_prompt, add_special_tokens=add_special_tokens)
    expected_ids = encoding["input_ids"]
    actual_ids = engine_prompt["prompt_token_ids"]
    assert actual_ids == expected_ids, (
        f"Token IDs don't match.\nExpected: {expected_ids}\nActual: {actual_ids}"
    )
@pytest.fixture(scope="module")
def cross_encoder_model_config():
    """Module-scoped ``ModelConfig`` for ``CROSS_ENCODER_MODEL_ID`` (pooling runner)."""
    model_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
    return model_config
@pytest.fixture(scope="module")
def cross_encoder_tokenizer(cross_encoder_model_config):
    """Module-scoped tokenizer for ``CROSS_ENCODER_MODEL_ID``.

    ``trust_remote_code`` is taken from the model config fixture.
    """
    trust_remote_code = cross_encoder_model_config.trust_remote_code
    return get_tokenizer(CROSS_ENCODER_MODEL_ID, trust_remote_code=trust_remote_code)
@pytest.fixture(scope="module")
def llm_reranker_model_config():
    """Model config for LLM-as-reranker style (no pad token)."""
    reranker_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
    # use_sep_token is a property that reads from hf_config,
    # so we set it there to override the default (True)
    setattr(reranker_config.hf_config, "use_sep_token", False)
    return reranker_config
@pytest.fixture
def tokenization_kwargs():
    """Common tokenization kwargs used across tests."""
    return dict(add_special_tokens=True, return_tensors=None)
@pytest.fixture
def mock_model_with_score_template():
    """Mock model class that supports score template and tracks post_process calls."""

    class MockModelWithScoreTemplate:
        # Flag advertising score-template support.
        supports_score_template = True
        # Class-level log of every prompt passed to post_process_tokens.
        post_process_called: list[TokensPrompt] = []

        @staticmethod
        def get_score_template(p1: str, p2: str) -> str:
            """Return the two inputs wrapped in fixed marker text."""
            return f"[QUERY]{p1}[SEP][DOC]{p2}"

        @staticmethod
        def post_process_tokens(prompt: TokensPrompt) -> None:
            """Record the prompt so tests can inspect the call."""
            MockModelWithScoreTemplate.post_process_called.append(prompt)

    # The class itself, not an instance, is the fixture value.
    return MockModelWithScoreTemplate
@pytest.fixture
def mock_model_no_score_template():
    """Mock model class that does not support score template."""

    class MockModelNoScoreTemplate:
        # Flag advertising that no score template is available.
        supports_score_template = False

    # The class itself, not an instance, is the fixture value.
    return MockModelNoScoreTemplate
class TestGetScorePrompt:
    """Tests for the get_score_prompt function."""

    def test_tokenization_kwargs_passed_through(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
    ):
        """Test that tokenization kwargs are properly passed through."""
        data_1 = "Query text"
        data_2 = "Document text"

        # Test with truncation - custom kwargs for this test
        custom_tokenization_kwargs = {
            "add_special_tokens": True,
            "return_tensors": None,
            "truncation": True,
            "max_length": 20,
        }

        full_prompt, engine_prompt = get_score_prompt(
            llm_reranker_model_config,
            cross_encoder_tokenizer,
            custom_tokenization_kwargs,
            data_1,
            data_2,
        )

        assert isinstance(full_prompt, str)
        assert "prompt_token_ids" in engine_prompt
        # With max_length=20 and truncation, should not exceed this
        assert len(engine_prompt["prompt_token_ids"]) <= 20
        # Since truncation was applied, token_ids should be a prefix of full encoding
        full_ids = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)[
            "input_ids"
        ]
        actual_ids = engine_prompt["prompt_token_ids"]
        assert full_ids[: len(actual_ids)] == actual_ids, (
            f"Token IDs are not a prefix of full encoding.\n"
            f"Full IDs: {full_ids}\n"
            f"Actual IDs: {actual_ids}"
        )

    def test_model_supports_score_template(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """Test when model supports score template (no score_template arg)."""
        # get_model_cls is patched to return the mock class with a template.
        with patch(
            "vllm.model_executor.model_loader.get_model_cls",
            return_value=mock_model_with_score_template,
        ):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query text",
                "document text",
            )

        # Matches the mock's get_score_template output for these inputs.
        assert full_prompt == "[QUERY]query text[SEP][DOC]document text"
        assert "prompt_token_ids" in engine_prompt
        assert len(engine_prompt["prompt_token_ids"]) > 0
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_model_supports_score_template_but_custom_template_provided(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """Test when model supports score template but custom template is provided."""
        template = (
            'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}'
        )
        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
                return_value=mock_model_with_score_template,
            ),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
                score_template=template,  # Providing a template
            )

        assert "prompt_token_ids" in engine_prompt
        # The explicit template's output is expected, not the mock's.
        assert full_prompt == "TEMPLATE_USED query doc"
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_not_using_default_template(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        # FIXME: For now, we only apply a template when one is explicitly provided.
        # We cannot rely on the tokenizer's chat template because many models
        # inherit junk templates from their base LLM, which breaks both the models
        # and the tests that use them.
        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
                return_value=mock_model_no_score_template,
            ),
            patch(
                "vllm.entrypoints.pooling.score.utils.safe_apply_chat_template",
                return_value="test querytest doc",
            ),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                llm_reranker_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "test query",
                "test doc",
            )

        assert full_prompt == "test querytest doc"
        assert "prompt_token_ids" in engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_fallback_with_sep_token(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Test fallback path when ChatTemplateResolutionError
        and use_sep_token=True."""
        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
                return_value=mock_model_no_score_template,
            ),
            patch(
                "vllm.entrypoints.pooling.score.utils.safe_apply_chat_template",
                side_effect=ChatTemplateResolutionError("No template"),
            ),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,  # use_sep_token=True
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )

        assert "prompt_token_ids" in engine_prompt
        # Should have token_type_ids from text_pair encoding
        assert "token_type_ids" in engine_prompt
        assert "query" in full_prompt
        assert "document" in full_prompt
        assert full_prompt != "querydocument"
        assert (
            engine_prompt["prompt_token_ids"]
            == cross_encoder_tokenizer(
                "query", text_pair="document", add_special_tokens=True
            )["input_ids"]
        )

        # FIXME(?): add_special_tokens=False is needed because in this case
        # full_prompt is obtained by decoding the tokenized prompt, which includes
        # special tokens and we would get duplicated special tokens otherwise.
        # This is inconsistent with other cases.
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer,
            full_prompt,
            engine_prompt,
            add_special_tokens=False,
        )

    def test_fallback_without_sep_token(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Test fallback path when ChatTemplateResolutionError
        and use_sep_token=False."""
        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
                return_value=mock_model_no_score_template,
            ),
            patch(
                "vllm.entrypoints.pooling.score.utils.safe_apply_chat_template",
                side_effect=ChatTemplateResolutionError("No template"),
            ),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                llm_reranker_model_config,  # use_sep_token=False
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )

        assert full_prompt == "querydocument"
        assert "prompt_token_ids" in engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_post_process_tokens_called(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """Test that post_process_tokens is called on the engine prompt."""
        # Reset the call tracker
        mock_model_with_score_template.post_process_called.clear()

        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
                return_value=mock_model_with_score_template,
            ),
            patch(
                "vllm.entrypoints.pooling.score.utils.safe_apply_chat_template",
                side_effect=ChatTemplateResolutionError("No template"),
            ),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
            )

        # post_process_tokens should have been called once
        assert len(mock_model_with_score_template.post_process_called) == 1
        # Identity check: the recorded prompt is the returned object itself.
        assert mock_model_with_score_template.post_process_called[0] is engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )
tests/entrypoints/pooling/scoring/util.py
View file @
d9d21eb8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
io
import
BytesIO
import
pybase64
as
base64
import
torch
import
torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
huggingface_hub
import
hf_hub_download
from
huggingface_hub
import
hf_hub_download
from
PIL
import
Image
from
safetensors.torch
import
load_file
from
safetensors.torch
import
load_file
from
transformers
import
AutoModel
,
AutoTokenizer
from
transformers
import
AutoModel
,
AutoTokenizer
from
tests.conftest
import
HfRunner
from
tests.conftest
import
HfRunner
from
vllm.entrypoints.pooling.score.utils
import
compute_maxsim_score
from
vllm.entrypoints.chat_utils
import
(
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartTextParam
,
)
from
vllm.entrypoints.pooling.scoring.typing
import
ScoreMultiModalParam
from
vllm.entrypoints.pooling.scoring.utils
import
compute_maxsim_score
class
ColBERTScoringHfRunner
(
torch
.
nn
.
Module
):
class
ColBERTScoringHfRunner
(
torch
.
nn
.
Module
):
...
@@ -67,3 +76,32 @@ class EncoderScoringHfRunner(HfRunner):
...
@@ -67,3 +76,32 @@ class EncoderScoringHfRunner(HfRunner):
for
pair
in
hf_embeddings
for
pair
in
hf_embeddings
]
]
return
torch
.
as_tensor
(
hf_outputs
)
return
torch
.
as_tensor
(
hf_outputs
)
def make_base64_image(
    width: int = 64, height: int = 64, color: tuple[int, int, int] = (255, 0, 0)
) -> str:
    """Create a small solid-color PNG image and return its base64 data URI.

    Args:
        width: Image width passed to ``Image.new``.
        height: Image height passed to ``Image.new``.
        color: RGB fill color.

    Returns:
        A ``data:image/png;base64,...`` string holding the encoded PNG.
    """
    size = (width, height)
    picture = Image.new("RGB", size, color)

    # Render the PNG into memory; nothing is written to disk.
    stream = BytesIO()
    picture.save(stream, format="PNG")
    png_bytes = stream.getvalue()

    encoded = base64.b64encode(png_bytes).decode()
    return f"data:image/png;base64,{encoded}"
def make_image_mm_param(
    image_uri: str,
    text: str | None = None,
) -> ScoreMultiModalParam:
    """Build a ScoreMultiModalParam containing an image (and optional text).

    Args:
        image_uri: Value stored under ``image_url["url"]`` of the image part.
        text: When not ``None``, appended as a text part after the image.

    Returns:
        A ``ScoreMultiModalParam`` whose ``content`` holds the part(s).
    """
    image_part = ChatCompletionContentPartImageParam(
        type="image_url",
        image_url={"url": image_uri},
    )
    parts: list = [image_part]

    # An empty string still counts as text; only None omits the text part.
    if text is not None:
        text_part = ChatCompletionContentPartTextParam(type="text", text=text)
        parts.append(text_part)

    return ScoreMultiModalParam(content=parts)
tests/entrypoints/pooling/token_classify/test_offline.py
View file @
d9d21eb8
...
@@ -60,7 +60,7 @@ def test_token_ids_prompts(llm: LLM):
...
@@ -60,7 +60,7 @@ def test_token_ids_prompts(llm: LLM):
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_score_api
(
llm
:
LLM
):
def
test_score_api
(
llm
:
LLM
):
err_msg
=
"Scor
e
API is only enabled for num_labels == 1."
err_msg
=
"Scor
ing
API is only enabled for num_labels == 1."
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
with
pytest
.
raises
(
ValueError
,
match
=
err_msg
):
llm
.
score
(
"ping"
,
"pong"
,
use_tqdm
=
False
)
llm
.
score
(
"ping"
,
"pong"
,
use_tqdm
=
False
)
...
...
tests/models/language/pooling/test_colbert.py
View file @
d9d21eb8
...
@@ -9,7 +9,7 @@ generic ColBERT support works with different encoder architectures.
...
@@ -9,7 +9,7 @@ generic ColBERT support works with different encoder architectures.
import
pytest
import
pytest
import
torch
import
torch
from
vllm.entrypoints.pooling.scor
e
.utils
import
compute_maxsim_score
from
vllm.entrypoints.pooling.scor
ing
.utils
import
compute_maxsim_score
# -----------------------------------------------------------------------
# -----------------------------------------------------------------------
# Model definitions: (model_name, colbert_dim, extra vllm_runner kwargs)
# Model definitions: (model_name, colbert_dim, extra vllm_runner kwargs)
...
...
tests/models/multimodal/pooling/test_colmodernvbert.py
View file @
d9d21eb8
...
@@ -10,7 +10,7 @@ embeddings for visual document retrieval.
...
@@ -10,7 +10,7 @@ embeddings for visual document retrieval.
import
pytest
import
pytest
import
torch
import
torch
from
vllm.entrypoints.pooling.scor
e
.utils
import
compute_maxsim_score
from
vllm.entrypoints.pooling.scor
ing
.utils
import
compute_maxsim_score
MODEL_NAME
=
"ModernVBERT/colmodernvbert-merged"
MODEL_NAME
=
"ModernVBERT/colmodernvbert-merged"
COLBERT_DIM
=
128
COLBERT_DIM
=
128
...
...
tests/models/multimodal/pooling/test_colpali.py
View file @
d9d21eb8
...
@@ -18,7 +18,7 @@ from vllm.entrypoints.chat_utils import (
...
@@ -18,7 +18,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartTextParam
,
ChatCompletionContentPartTextParam
,
)
)
from
vllm.entrypoints.pooling.scor
e.utils
import
ScoreMultiModalParam
from
vllm.entrypoints.pooling.scor
ing.typing
import
ScoreMultiModalParam
from
....conftest
import
VllmRunner
from
....conftest
import
VllmRunner
...
@@ -114,7 +114,7 @@ def _run_late_interaction_test(
...
@@ -114,7 +114,7 @@ def _run_late_interaction_test(
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
"""Verify MaxSim scoring matches manual computation."""
"""Verify MaxSim scoring matches manual computation."""
from
vllm.entrypoints.pooling.scor
e
.utils
import
compute_maxsim_score
from
vllm.entrypoints.pooling.scor
ing
.utils
import
compute_maxsim_score
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
...
...
tests/models/multimodal/pooling/test_colqwen3.py
View file @
d9d21eb8
...
@@ -18,7 +18,7 @@ from vllm.entrypoints.chat_utils import (
...
@@ -18,7 +18,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartTextParam
,
ChatCompletionContentPartTextParam
,
)
)
from
vllm.entrypoints.pooling.scor
e.utils
import
ScoreMultiModalParam
from
vllm.entrypoints.pooling.scor
ing.typing
import
ScoreMultiModalParam
from
....conftest
import
VllmRunner
from
....conftest
import
VllmRunner
...
@@ -125,7 +125,7 @@ def _run_late_interaction_test(
...
@@ -125,7 +125,7 @@ def _run_late_interaction_test(
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
"""Verify MaxSim scoring matches manual computation."""
"""Verify MaxSim scoring matches manual computation."""
from
vllm.entrypoints.pooling.scor
e
.utils
import
compute_maxsim_score
from
vllm.entrypoints.pooling.scor
ing
.utils
import
compute_maxsim_score
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
...
...
tests/models/multimodal/pooling/test_colqwen3_5.py
View file @
d9d21eb8
...
@@ -73,7 +73,7 @@ def _run_late_interaction_test(
...
@@ -73,7 +73,7 @@ def _run_late_interaction_test(
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
"""Verify MaxSim scoring matches manual computation."""
"""Verify MaxSim scoring matches manual computation."""
from
vllm.entrypoints.pooling.scor
e
.utils
import
compute_maxsim_score
from
vllm.entrypoints.pooling.scor
ing
.utils
import
compute_maxsim_score
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
...
...
tests/models/multimodal/pooling/test_jinavl_reranker.py
View file @
d9d21eb8
...
@@ -11,7 +11,7 @@ from vllm.entrypoints.chat_utils import (
...
@@ -11,7 +11,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartTextParam
,
ChatCompletionContentPartTextParam
,
)
)
from
vllm.entrypoints.pooling.scor
e.utils
import
ScoreMultiModalParam
from
vllm.entrypoints.pooling.scor
ing.typing
import
ScoreMultiModalParam
from
....conftest
import
HfRunner
,
VllmRunner
from
....conftest
import
HfRunner
,
VllmRunner
...
...
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
View file @
d9d21eb8
...
@@ -21,7 +21,7 @@ from vllm.entrypoints.chat_utils import (
...
@@ -21,7 +21,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartTextParam
,
ChatCompletionContentPartTextParam
,
)
)
from
vllm.entrypoints.pooling.scor
e.utils
import
ScoreMultiModalParam
from
vllm.entrypoints.pooling.scor
ing.typing
import
ScoreMultiModalParam
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
...
...
vllm/entrypoints/llm.py
View file @
d9d21eb8
...
@@ -46,22 +46,16 @@ from vllm.entrypoints.chat_utils import (
...
@@ -46,22 +46,16 @@ from vllm.entrypoints.chat_utils import (
load_chat_template
,
load_chat_template
,
)
)
from
vllm.entrypoints.pooling.io_processor_factories
import
init_pooling_io_processors
from
vllm.entrypoints.pooling.io_processor_factories
import
init_pooling_io_processors
from
vllm.entrypoints.pooling.score.utils
import
(
from
vllm.entrypoints.pooling.scoring.io_processor
import
(
ScoreData
,
ScoringIOProcessor
,
ScoreMultiModalParam
,
_cosine_similarity
,
compress_token_type_ids
,
compute_maxsim_score
,
get_score_prompt
,
score_data_to_prompts
,
validate_score_input
,
)
)
from
vllm.entrypoints.pooling.scoring.typing
import
ScoreInput
from
vllm.entrypoints.pooling.typing
import
OfflineInputsContext
,
OfflineOutputsContext
from
vllm.entrypoints.utils
import
log_non_default_args
from
vllm.entrypoints.utils
import
log_non_default_args
from
vllm.inputs
import
(
from
vllm.inputs
import
(
DataPrompt
,
DataPrompt
,
EngineInput
,
EngineInput
,
PromptType
,
PromptType
,
SingletonPrompt
,
TextPrompt
,
TextPrompt
,
TokensPrompt
,
TokensPrompt
,
)
)
...
@@ -1161,7 +1155,9 @@ class LLM:
...
@@ -1161,7 +1155,9 @@ class LLM:
if
pooling_task
in
self
.
pooling_io_processors
:
if
pooling_task
in
self
.
pooling_io_processors
:
io_processor
=
self
.
pooling_io_processors
[
pooling_task
]
io_processor
=
self
.
pooling_io_processors
[
pooling_task
]
processor_inputs
=
io_processor
.
pre_process_offline
(
processor_inputs
=
io_processor
.
pre_process_offline
(
prompts_seq
,
tokenization_kwargs
ctx
=
OfflineInputsContext
(
prompts
=
prompts_seq
,
tokenization_kwargs
=
tokenization_kwargs
)
)
)
seq_lora_requests
=
self
.
_lora_request_to_seq
(
seq_lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
len
(
prompts_seq
)
lora_request
,
len
(
prompts_seq
)
...
@@ -1178,7 +1174,9 @@ class LLM:
...
@@ -1178,7 +1174,9 @@ class LLM:
outputs
=
self
.
_run_engine
(
outputs
=
self
.
_run_engine
(
use_tqdm
=
use_tqdm
,
output_type
=
PoolingRequestOutput
use_tqdm
=
use_tqdm
,
output_type
=
PoolingRequestOutput
)
)
outputs
=
io_processor
.
post_process_offline
(
outputs
)
outputs
=
io_processor
.
post_process_offline
(
ctx
=
OfflineOutputsContext
(
outputs
=
outputs
)
)
else
:
else
:
outputs
=
self
.
_run_completion
(
outputs
=
self
.
_run_completion
(
prompts
=
prompts_seq
,
prompts
=
prompts_seq
,
...
@@ -1378,188 +1376,10 @@ class LLM:
...
@@ -1378,188 +1376,10 @@ class LLM:
tokenization_kwargs
=
tokenization_kwargs
,
tokenization_kwargs
=
tokenization_kwargs
,
)
)
def
_embedding_score
(
self
,
data_1
:
list
[
ScoreData
],
data_2
:
list
[
ScoreData
],
*
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
],
pooling_params
:
PoolingParams
|
None
,
lora_request
:
list
[
LoRARequest
]
|
LoRARequest
|
None
,
tokenization_kwargs
:
dict
[
str
,
Any
],
)
->
list
[
ScoringRequestOutput
]:
tokenizer
=
self
.
get_tokenizer
()
input_texts
:
list
[
str
]
=
[]
for
text
in
data_1
+
data_2
:
if
not
isinstance
(
text
,
str
):
raise
NotImplementedError
(
"Embedding scores currently do not support multimodal input."
)
input_texts
.
append
(
text
)
encoded_output
=
self
.
encode
(
input_texts
,
use_tqdm
=
use_tqdm
,
lora_request
=
lora_request
,
pooling_params
=
pooling_params
,
pooling_task
=
"embed"
,
tokenization_kwargs
=
tokenization_kwargs
,
)
encoded_output_1
=
encoded_output
[
0
:
len
(
data_1
)]
encoded_output_2
=
encoded_output
[
len
(
data_1
)
:]
if
len
(
encoded_output_1
)
==
1
:
encoded_output_1
=
encoded_output_1
*
len
(
encoded_output_2
)
scores
=
_cosine_similarity
(
tokenizer
=
tokenizer
,
embed_1
=
encoded_output_1
,
embed_2
=
encoded_output_2
,
)
return
[
ScoringRequestOutput
.
from_base
(
item
)
for
item
in
scores
]
def
_late_interaction_score
(
self
,
data_1
:
list
[
ScoreData
],
data_2
:
list
[
ScoreData
],
*
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
],
pooling_params
:
PoolingParams
|
None
,
lora_request
:
list
[
LoRARequest
]
|
LoRARequest
|
None
,
tokenization_kwargs
:
dict
[
str
,
Any
],
)
->
list
[
ScoringRequestOutput
]:
"""
Late interaction scoring (ColBERT MaxSim).
Encodes queries and documents into per-token embeddings, then computes
MaxSim: sum over query tokens of max similarity to any document token.
"""
from
vllm.outputs
import
PoolingOutput
tokenizer
=
self
.
get_tokenizer
()
# Convert ScoreData to PromptType (handles both text and multimodal)
model_config
=
self
.
model_config
prompts_1
=
score_data_to_prompts
(
data_1
,
"query"
,
model_config
)
prompts_2
=
score_data_to_prompts
(
data_2
,
"document"
,
model_config
)
encoded_output
:
list
[
PoolingRequestOutput
]
=
self
.
encode
(
prompts_1
+
prompts_2
,
use_tqdm
=
use_tqdm
,
lora_request
=
lora_request
,
pooling_params
=
pooling_params
,
pooling_task
=
"token_embed"
,
tokenization_kwargs
=
tokenization_kwargs
,
)
encoded_output_1
:
list
[
PoolingRequestOutput
]
=
encoded_output
[:
len
(
prompts_1
)]
encoded_output_2
:
list
[
PoolingRequestOutput
]
=
encoded_output
[
len
(
prompts_1
)
:]
if
len
(
encoded_output_1
)
==
1
:
encoded_output_1
=
encoded_output_1
*
len
(
encoded_output_2
)
# Compute MaxSim scores
scores
:
list
[
PoolingRequestOutput
]
=
[]
padding
:
list
[
int
]
=
[]
if
(
pad_token_id
:
=
tokenizer
.
pad_token_id
)
is
not
None
:
padding
=
[
pad_token_id
]
for
emb_1
,
emb_2
in
zip
(
encoded_output_1
,
encoded_output_2
):
# emb_1.outputs.data: [query_len, dim]
# emb_2.outputs.data: [doc_len, dim]
q_emb
=
emb_1
.
outputs
.
data
d_emb
=
emb_2
.
outputs
.
data
maxsim_score
=
compute_maxsim_score
(
q_emb
,
d_emb
)
tokens
=
emb_1
.
prompt_token_ids
+
padding
+
emb_2
.
prompt_token_ids
scores
.
append
(
PoolingRequestOutput
(
request_id
=
f
"
{
emb_1
.
request_id
}
_
{
emb_2
.
request_id
}
"
,
outputs
=
PoolingOutput
(
data
=
maxsim_score
),
prompt_token_ids
=
tokens
,
num_cached_tokens
=
emb_1
.
num_cached_tokens
+
emb_2
.
num_cached_tokens
,
finished
=
True
,
)
)
return
[
ScoringRequestOutput
.
from_base
(
item
)
for
item
in
scores
]
def
_cross_encoding_score
(
self
,
data_1
:
list
[
ScoreData
],
data_2
:
list
[
ScoreData
],
*
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
],
pooling_params
:
PoolingParams
|
None
,
lora_request
:
list
[
LoRARequest
]
|
LoRARequest
|
None
,
tokenization_kwargs
:
dict
[
str
,
Any
],
score_template
:
str
|
None
,
)
->
list
[
ScoringRequestOutput
]:
model_config
=
self
.
model_config
tokenizer
=
self
.
get_tokenizer
()
if
is_mistral_tokenizer
(
tokenizer
):
raise
ValueError
(
"Score API is not supported for Mistral tokenizer"
)
if
len
(
data_1
)
==
1
:
data_1
=
data_1
*
len
(
data_2
)
if
pooling_params
is
None
:
pooling_params
=
PoolingParams
(
task
=
"classify"
)
elif
pooling_params
.
task
is
None
:
pooling_params
.
task
=
"classify"
pooling_params_list
=
list
[
PoolingParams
]()
prompts
=
list
[
PromptType
]()
input_pairs
=
[(
t1
,
t2
)
for
t1
,
t2
in
zip
(
data_1
,
data_2
)]
for
q
,
d
in
input_pairs
:
_
,
engine_prompt
=
get_score_prompt
(
model_config
=
model_config
,
data_1
=
q
,
data_2
=
d
,
tokenizer
=
tokenizer
,
tokenization_kwargs
=
tokenization_kwargs
,
score_template
=
score_template
,
)
if
token_type_ids
:
=
engine_prompt
.
pop
(
"token_type_ids"
,
None
):
params
=
pooling_params
.
clone
()
compressed
=
compress_token_type_ids
(
token_type_ids
)
params
.
extra_kwargs
=
{
"compressed_token_type_ids"
:
compressed
}
pooling_params_list
.
append
(
params
)
else
:
pooling_params_list
.
append
(
pooling_params
)
prompts
.
append
(
engine_prompt
)
outputs
=
self
.
_run_completion
(
prompts
=
prompts
,
params
=
pooling_params_list
,
output_type
=
PoolingRequestOutput
,
use_tqdm
=
use_tqdm
,
lora_request
=
lora_request
,
)
return
[
ScoringRequestOutput
.
from_base
(
item
)
for
item
in
outputs
]
def
score
(
def
score
(
self
,
self
,
data_1
:
SingletonPrompt
data_1
:
ScoreInput
|
list
[
ScoreInput
],
|
Sequence
[
SingletonPrompt
]
data_2
:
ScoreInput
|
list
[
ScoreInput
],
|
ScoreMultiModalParam
|
list
[
ScoreMultiModalParam
],
data_2
:
SingletonPrompt
|
Sequence
[
SingletonPrompt
]
|
ScoreMultiModalParam
|
list
[
ScoreMultiModalParam
],
/
,
/
,
*
,
*
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
]
=
True
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
]
=
True
,
...
@@ -1606,83 +1426,71 @@ class LLM:
...
@@ -1606,83 +1426,71 @@ class LLM:
A list of `ScoringRequestOutput` objects containing the
A list of `ScoringRequestOutput` objects containing the
generated scores in the same order as the input prompts.
generated scores in the same order as the input prompts.
"""
"""
model_config
=
self
.
model_config
runner_type
=
model_config
.
runner_type
if
self
.
runner_type
!=
"pooling"
:
if
runner_type
!=
"pooling"
:
raise
ValueError
(
raise
ValueError
(
"LLM.score() is only supported for pooling models. "
"LLM.score() is only supported for pooling models. "
"Try passing `--runner pooling` to use the model as a "
"Try passing `--runner pooling` to use the model as a "
"pooling model."
"pooling model."
)
)
supported_tasks
=
self
.
supported_tasks
score_type
=
self
.
model_config
.
score_type
score_type
=
self
.
model_config
.
score_type
is_late_interaction
=
score_type
==
"late-interaction"
if
(
is_cross_encoder
=
score_type
==
"cross-encoder"
score_type
==
"cross-encoder"
and
getattr
(
self
.
model_config
.
hf_config
,
"num_labels"
,
0
)
!=
1
# Late interaction models (e.g., ColBERT) use token_embed for scoring
if
not
is_late_interaction
and
all
(
t
not
in
supported_tasks
for
t
in
(
"embed"
,
"classify"
)
):
):
raise
ValueError
(
raise
ValueError
(
"Scoring API is only enabled for num_labels == 1."
)
"Score API is not supported by this model. "
"Try converting the model using "
"`--convert embed` or `--convert classify`."
)
if
is_cross_encoder
and
getattr
(
model_config
.
hf_config
,
"num_labels"
,
0
)
!=
1
:
if
score_type
is
None
or
score_type
not
in
self
.
pooling_io_processors
:
raise
ValueError
(
"
Score API is only enabled for num_labels == 1
."
)
raise
ValueError
(
"
This model does not support the Scoring API
."
)
if
not
is_cross_encoder
and
chat_template
is
not
None
:
io_processor
=
self
.
pooling_io_processors
[
score_type
]
raise
ValueError
(
assert
isinstance
(
io_processor
,
ScoringIOProcessor
)
"chat_template is only supported for cross-encoder models."
)
is_multimodal_model
=
model_config
.
is_multimodal_model
pooling_task
=
io_processor
.
pooling_task
architecture
=
model_config
.
architecture
scoring_data
=
io_processor
.
valid_inputs
(
data_1
,
data_2
)
offset
=
len
(
scoring_data
.
data_1
)
score_data_1
,
score_data_2
=
validate_score_input
(
ctx
=
OfflineInputsContext
(
data_1
,
# type: ignore[arg-type]
prompts
=
scoring_data
,
data_2
,
# type: ignore[arg-type]
pooling_params
=
pooling_params
,
is_multimodal_model
=
is_multimodal_model
,
tokenization_kwargs
=
tokenization_kwargs
,
architecture
=
architecture
,
chat_template
=
chat_template
,
offset
=
offset
,
)
)
renderer
=
self
.
renderer
processor_inputs
=
io_processor
.
pre_process_offline
(
ctx
)
tok_params
=
renderer
.
default_cmpl_tok_params
.
with_kwargs
(
**
(
tokenization_kwargs
or
{})
seq_lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
len
(
processor_inputs
)
)
)
encode_kwargs
=
tok_params
.
get_encode_kwargs
()
if
is_cross_encoder
:
if
ctx
.
pooling_params
is
None
:
return
self
.
_cross_encoding_score
(
ctx
.
pooling_params
=
PoolingParams
()
score_data_1
,
params_seq
=
self
.
_params_to_seq
(
ctx
.
pooling_params
,
len
(
processor_inputs
))
score_data_2
,
use_tqdm
=
use_tqdm
,
for
param
in
params_seq
:
pooling_params
=
pooling_params
,
if
param
.
task
is
None
:
lora_request
=
lora_request
,
param
.
task
=
pooling_task
tokenization_kwargs
=
encode_kwargs
,
elif
param
.
task
!=
pooling_task
:
score_template
=
chat_template
,
msg
=
f
"You cannot overwrite
{
param
.
task
=
!
r
}
with
{
pooling_task
=
!
r
}
!"
)
raise
ValueError
(
msg
)
elif
is_late_interaction
:
return
self
.
_late_interaction_score
(
seq_priority
=
self
.
_priority_to_seq
(
None
,
len
(
processor_inputs
))
score_data_1
,
score_data_2
,
self
.
_render_and_add_requests
(
use_tqdm
=
use_tqdm
,
prompts
=
processor_inputs
,
pooling_params
=
pooling_params
,
params
=
params_seq
,
lora_request
=
lora_request
,
lora_requests
=
seq_lora_requests
,
tokenization_kwargs
=
encode_kwargs
,
priorities
=
seq_priority
,
)
)
else
:
return
self
.
_embedding_score
(
outputs
=
self
.
_run_engine
(
use_tqdm
=
use_tqdm
,
output_type
=
PoolingRequestOutput
)
score_data_1
,
outputs
=
io_processor
.
post_process_offline
(
score_data_2
,
ctx
=
OfflineOutputsContext
(
outputs
=
outputs
,
offset
=
offset
),
use_tqdm
=
use_tqdm
,
)
pooling_params
=
pooling_params
,
lora_request
=
lora_request
,
return
[
ScoringRequestOutput
.
from_base
(
item
)
for
item
in
outputs
]
tokenization_kwargs
=
encode_kwargs
,
)
def
start_profile
(
self
,
profile_prefix
:
str
|
None
=
None
)
->
None
:
def
start_profile
(
self
,
profile_prefix
:
str
|
None
=
None
)
->
None
:
"""Start profiling with optional custom trace prefix.
"""Start profiling with optional custom trace prefix.
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment