"vscode:/vscode.git/clone" did not exist on "6f88f762bf907bccdadc52948001b21ccb616a01"
Unverified Commit 1b8fe6f7 authored by wang.yuqi's avatar wang.yuqi Committed by GitHub
Browse files

[Frontend][4/n] Make pooling entrypoints request schema consensus | ScoreRequest (#33060)


Signed-off-by: default avatarwang.yuqi <yuqi.wang@daocloud.io>
parent 52ee2102
...@@ -89,6 +89,29 @@ def main(args): ...@@ -89,6 +89,29 @@ def main(args):
response = requests.post(rerank_url, json=prompt) response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json()) pprint.pprint(response.json())
print("Query: string & Document: text + image url")
prompt = {
"model": model,
"query": query,
"documents": {"content": [documents[0], documents[1]]},
}
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: list")
prompt = {
"model": model,
"query": query,
"documents": [
document,
{"content": [documents[0]]},
{"content": [documents[1]]},
{"content": [documents[0], documents[1]]},
],
}
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
......
...@@ -92,6 +92,44 @@ def main(args): ...@@ -92,6 +92,44 @@ def main(args):
response = requests.post(score_url, json=prompt) response = requests.post(score_url, json=prompt)
pprint.pprint(response.json()) pprint.pprint(response.json())
print("Query: string & Document: text + image url")
prompt = {
"model": model,
"queries": query,
"documents": {"content": [documents[0], documents[1]]},
}
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: list")
prompt = {
"model": model,
"queries": query,
"documents": [
document,
{"content": [documents[0]]},
{"content": [documents[1]]},
{"content": [documents[0], documents[1]]},
],
}
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
print("Query: list & Document: list")
data = [
document,
{"content": [documents[0]]},
{"content": [documents[1]]},
{"content": [documents[0], documents[1]]},
]
prompt = {
"model": model,
"queries": data,
"documents": data,
}
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
......
...@@ -90,6 +90,35 @@ class TestModel: ...@@ -90,6 +90,35 @@ class TestModel:
for i in range(len(vllm_outputs)): for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_queries_str_items_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
queries = "What is the capital of France?"
items = "The capital of France is Paris."
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"items": items,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[queries, items]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_text_1_str_text_2_str( def test_text_1_str_text_2_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner self, server: RemoteOpenAIServer, model: dict[str, Any], runner
): ):
......
...@@ -5,7 +5,7 @@ import pytest ...@@ -5,7 +5,7 @@ import pytest
import requests import requests
from tests.utils import VLLM_PATH, RemoteOpenAIServer from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import ScoreResponse from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
from vllm.multimodal.utils import encode_image_url, fetch_image from vllm.multimodal.utils import encode_image_url, fetch_image
MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B" MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"
...@@ -16,11 +16,12 @@ HF_OVERRIDES = { ...@@ -16,11 +16,12 @@ HF_OVERRIDES = {
} }
query = "A cat standing in the snow." query = "A cat standing in the snow."
document = "This product was excellent and exceeded my expectations."
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg" image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
documents = [ documents = [
{ {
"type": "text", "type": "text",
"text": query, "text": document,
}, },
{ {
"type": "image_url", "type": "image_url",
...@@ -32,6 +33,11 @@ documents = [ ...@@ -32,6 +33,11 @@ documents = [
}, },
] ]
TEXT_VS_TEXT = 0.10040374100208282
TEXT_VS_IMAGE = 0.7423753142356873
TEXT_VS_TEXT_PLUS_IMAGE = 0.5298863053321838
TOL = 0.05
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
...@@ -50,15 +56,12 @@ def server(): ...@@ -50,15 +56,12 @@ def server():
def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer): def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
queries = "What is the capital of France?"
documents = "The capital of France is Paris."
score_response = requests.post( score_response = requests.post(
server.url_for("score"), server.url_for("score"),
json={ json={
"model": MODEL_NAME, "model": MODEL_NAME,
"queries": queries, "queries": query,
"documents": documents, "documents": document,
}, },
) )
score_response.raise_for_status() score_response.raise_for_status()
...@@ -67,6 +70,8 @@ def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer): ...@@ -67,6 +70,8 @@ def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
assert score.id is not None assert score.id is not None
assert score.data is not None assert score.data is not None
assert len(score.data) == 1 assert len(score.data) == 1
assert score.usage.prompt_tokens == 81
assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer): def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer):
...@@ -84,6 +89,8 @@ def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer ...@@ -84,6 +89,8 @@ def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer
assert score.id is not None assert score.id is not None
assert score.data is not None assert score.data is not None
assert len(score.data) == 1 assert len(score.data) == 1
assert score.usage.prompt_tokens == 81
assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIServer): def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIServer):
...@@ -101,6 +108,8 @@ def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIS ...@@ -101,6 +108,8 @@ def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIS
assert score.id is not None assert score.id is not None
assert score.data is not None assert score.data is not None
assert len(score.data) == 1 assert len(score.data) == 1
assert score.usage.prompt_tokens == 98
assert score.data[0].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
def test_score_api_queries_str_documents_image_base64_content( def test_score_api_queries_str_documents_image_base64_content(
...@@ -120,3 +129,111 @@ def test_score_api_queries_str_documents_image_base64_content( ...@@ -120,3 +129,111 @@ def test_score_api_queries_str_documents_image_base64_content(
assert score.id is not None assert score.id is not None
assert score.data is not None assert score.data is not None
assert len(score.data) == 1 assert len(score.data) == 1
assert score.usage.prompt_tokens == 98
assert score.data[0].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
def test_score_api_queries_str_documents_image_url_plus_text_content(
server: RemoteOpenAIServer,
):
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": query,
"documents": {"content": [documents[0], documents[1]]},
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
assert score.usage.prompt_tokens == 108
assert score.data[0].score == pytest.approx(TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL)
def test_score_api_queries_str_documents_list(server: RemoteOpenAIServer):
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": query,
"documents": [
document,
{"content": [documents[0]]},
{"content": [documents[1]]},
{"content": [documents[0], documents[1]]},
],
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 4
assert score.usage.prompt_tokens == 368
assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
assert score.data[1].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
assert score.data[2].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
assert score.data[3].score == pytest.approx(TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL)
def test_rerank_api_queries_str_documents_list(server: RemoteOpenAIServer):
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": [
document,
{"content": [documents[0]]},
{"content": [documents[1]]},
{"content": [documents[0], documents[1]]},
],
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.model is not None
assert rerank.usage is not None
assert len(rerank.results) == 4
rerank.results.sort(key=lambda x: x.index)
assert rerank.results[0].relevance_score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
assert rerank.results[1].relevance_score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
assert rerank.results[2].relevance_score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
assert rerank.results[3].relevance_score == pytest.approx(
TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL
)
def test_score_api_queries_list_documents_list(server: RemoteOpenAIServer):
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": [query] * 4,
"documents": [
document,
{"content": [documents[0]]},
{"content": [documents[1]]},
{"content": [documents[0], documents[1]]},
],
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 4
assert score.usage.prompt_tokens == 368
assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
assert score.data[1].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
assert score.data[2].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
assert score.data[3].score == pytest.approx(TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL)
...@@ -40,12 +40,12 @@ from vllm.entrypoints.chat_utils import ( ...@@ -40,12 +40,12 @@ from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption, ChatTemplateContentFormatOption,
) )
from vllm.entrypoints.pooling.score.utils import ( from vllm.entrypoints.pooling.score.utils import (
ScoreContentPartParam, ScoreData,
ScoreMultiModalParam, ScoreMultiModalParam,
_cosine_similarity, _cosine_similarity,
_validate_score_input_lens,
compress_token_type_ids, compress_token_type_ids,
get_score_prompt, get_score_prompt,
validate_score_input,
) )
from vllm.entrypoints.utils import log_non_default_args from vllm.entrypoints.utils import log_non_default_args
from vllm.inputs import ( from vllm.inputs import (
...@@ -1326,8 +1326,8 @@ class LLM: ...@@ -1326,8 +1326,8 @@ class LLM:
def _embedding_score( def _embedding_score(
self, self,
text_1: list[SingletonPrompt], data_1: list[ScoreData],
text_2: list[SingletonPrompt], data_2: list[ScoreData],
*, *,
use_tqdm: bool | Callable[..., tqdm], use_tqdm: bool | Callable[..., tqdm],
pooling_params: PoolingParams | None, pooling_params: PoolingParams | None,
...@@ -1336,8 +1336,16 @@ class LLM: ...@@ -1336,8 +1336,16 @@ class LLM:
) -> list[ScoringRequestOutput]: ) -> list[ScoringRequestOutput]:
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
input_texts: list[str] = []
for text in data_1 + data_2:
if not isinstance(text, str):
raise NotImplementedError(
"Embedding scores currently do not support multimodal input."
)
input_texts.append(text)
encoded_output = self.encode( encoded_output = self.encode(
text_1 + text_2, input_texts,
use_tqdm=use_tqdm, use_tqdm=use_tqdm,
lora_request=lora_request, lora_request=lora_request,
pooling_params=pooling_params, pooling_params=pooling_params,
...@@ -1345,8 +1353,8 @@ class LLM: ...@@ -1345,8 +1353,8 @@ class LLM:
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
) )
encoded_output_1 = encoded_output[0 : len(text_1)] encoded_output_1 = encoded_output[0 : len(data_1)]
encoded_output_2 = encoded_output[len(text_1) :] encoded_output_2 = encoded_output[len(data_1) :]
if len(encoded_output_1) == 1: if len(encoded_output_1) == 1:
encoded_output_1 = encoded_output_1 * len(encoded_output_2) encoded_output_1 = encoded_output_1 * len(encoded_output_2)
...@@ -1362,8 +1370,8 @@ class LLM: ...@@ -1362,8 +1370,8 @@ class LLM:
def _cross_encoding_score( def _cross_encoding_score(
self, self,
data_1: list[str] | list[ScoreContentPartParam], data_1: list[ScoreData],
data_2: list[str] | list[ScoreContentPartParam], data_2: list[ScoreData],
*, *,
use_tqdm: bool | Callable[..., tqdm], use_tqdm: bool | Callable[..., tqdm],
pooling_params: PoolingParams | None, pooling_params: PoolingParams | None,
...@@ -1424,8 +1432,14 @@ class LLM: ...@@ -1424,8 +1432,14 @@ class LLM:
def score( def score(
self, self,
data_1: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam, data_1: SingletonPrompt
data_2: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam, | Sequence[SingletonPrompt]
| ScoreMultiModalParam
| list[ScoreMultiModalParam],
data_2: SingletonPrompt
| Sequence[SingletonPrompt]
| ScoreMultiModalParam
| list[ScoreMultiModalParam],
/, /,
*, *,
use_tqdm: bool | Callable[..., tqdm] = True, use_tqdm: bool | Callable[..., tqdm] = True,
...@@ -1501,73 +1515,23 @@ class LLM: ...@@ -1501,73 +1515,23 @@ class LLM:
"chat_template is only supported for cross-encoder models." "chat_template is only supported for cross-encoder models."
) )
# the tokenizer for models such as is_multimodal_model = model_config.is_multimodal_model
# "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing architecture = model_config.architecture
# lists of tokens to the `text` and `text_pair` kwargs
tokenizer = self.get_tokenizer()
if not model_config.is_multimodal_model:
def check_data_type(
data: SingletonPrompt
| Sequence[SingletonPrompt]
| ScoreMultiModalParam,
):
if isinstance(data, dict) and "content" in data:
raise ValueError(
"ScoreMultiModalParam is not supported "
f"for {model_config.architecture}"
)
check_data_type(data_1)
check_data_type(data_2)
def ensure_str(prompt: SingletonPrompt): score_data_1, score_data_2 = validate_score_input(
if isinstance(prompt, dict): data_1, # type: ignore[arg-type]
if "multi_modal_data" in prompt: data_2, # type: ignore[arg-type]
raise ValueError( is_multimodal_model=is_multimodal_model,
"Multi-modal prompt is not supported for scoring" architecture=architecture,
)
elif "prompt_token_ids" in prompt:
prompt = tokenizer.decode(
cast(TokensPrompt, prompt)["prompt_token_ids"]
) )
elif "prompt" in prompt:
prompt = cast(TextPrompt, prompt)["prompt"]
assert type(prompt) is str
return prompt
if isinstance(data_1, (str, dict)):
# Convert a single prompt to a list.
data_1 = [data_1] # type: ignore[list-item]
data_1 = [ensure_str(t) for t in data_1]
if isinstance(data_2, (str, dict)):
# Convert a single prompt to a list.
data_2 = [data_2] # type: ignore[list-item]
data_2 = [ensure_str(t) for t in data_2]
if isinstance(data_1, dict) and "content" in data_1:
data_1 = data_1.get("content") # type: ignore[assignment]
elif isinstance(data_1, str):
data_1 = [data_1]
if isinstance(data_2, dict) and "content" in data_2:
data_2 = data_2.get("content") # type: ignore[assignment]
elif isinstance(data_2, str):
data_2 = [data_2]
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
tok_params = self._get_cmpl_tok_params(tokenization_kwargs) tok_params = self._get_cmpl_tok_params(tokenization_kwargs)
encode_kwargs = tok_params.get_encode_kwargs() encode_kwargs = tok_params.get_encode_kwargs()
if model_config.is_cross_encoder: if model_config.is_cross_encoder:
return self._cross_encoding_score( return self._cross_encoding_score(
data_1, # type: ignore[arg-type] score_data_1,
data_2, # type: ignore[arg-type] score_data_2,
use_tqdm=use_tqdm, use_tqdm=use_tqdm,
pooling_params=pooling_params, pooling_params=pooling_params,
lora_request=lora_request, lora_request=lora_request,
...@@ -1576,8 +1540,8 @@ class LLM: ...@@ -1576,8 +1540,8 @@ class LLM:
) )
else: else:
return self._embedding_score( return self._embedding_score(
data_1, # type: ignore[arg-type] score_data_1,
data_2, # type: ignore[arg-type] score_data_2,
use_tqdm=use_tqdm, use_tqdm=use_tqdm,
pooling_params=pooling_params, pooling_params=pooling_params,
lora_request=lora_request, lora_request=lora_request,
......
...@@ -14,7 +14,8 @@ from vllm.entrypoints.pooling.base.protocol import ( ...@@ -14,7 +14,8 @@ from vllm.entrypoints.pooling.base.protocol import (
) )
from vllm.entrypoints.pooling.score.utils import ( from vllm.entrypoints.pooling.score.utils import (
ScoreContentPartParam, ScoreContentPartParam,
ScoreMultiModalParam, ScoreInput,
ScoreInputs,
) )
from vllm.renderers import TokenizeParams from vllm.renderers import TokenizeParams
from vllm.utils import random_uuid from vllm.utils import random_uuid
...@@ -47,13 +48,13 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin): ...@@ -47,13 +48,13 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
class ScoreDataRequest(ScoreRequestMixin): class ScoreDataRequest(ScoreRequestMixin):
data_1: list[str] | str | ScoreMultiModalParam data_1: ScoreInputs
data_2: list[str] | str | ScoreMultiModalParam data_2: ScoreInputs
class ScoreQueriesDocumentsRequest(ScoreRequestMixin): class ScoreQueriesDocumentsRequest(ScoreRequestMixin):
queries: list[str] | str | ScoreMultiModalParam queries: ScoreInputs
documents: list[str] | str | ScoreMultiModalParam documents: ScoreInputs
@property @property
def data_1(self): def data_1(self):
...@@ -64,9 +65,22 @@ class ScoreQueriesDocumentsRequest(ScoreRequestMixin): ...@@ -64,9 +65,22 @@ class ScoreQueriesDocumentsRequest(ScoreRequestMixin):
return self.documents return self.documents
class ScoreQueriesItemsRequest(ScoreRequestMixin):
queries: ScoreInputs
items: ScoreInputs
@property
def data_1(self):
return self.queries
@property
def data_2(self):
return self.items
class ScoreTextRequest(ScoreRequestMixin): class ScoreTextRequest(ScoreRequestMixin):
text_1: list[str] | str | ScoreMultiModalParam text_1: ScoreInputs
text_2: list[str] | str | ScoreMultiModalParam text_2: ScoreInputs
@property @property
def data_1(self): def data_1(self):
...@@ -78,13 +92,16 @@ class ScoreTextRequest(ScoreRequestMixin): ...@@ -78,13 +92,16 @@ class ScoreTextRequest(ScoreRequestMixin):
ScoreRequest: TypeAlias = ( ScoreRequest: TypeAlias = (
ScoreQueriesDocumentsRequest | ScoreDataRequest | ScoreTextRequest ScoreQueriesDocumentsRequest
| ScoreQueriesItemsRequest
| ScoreDataRequest
| ScoreTextRequest
) )
class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin): class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
query: str | ScoreMultiModalParam query: ScoreInput
documents: list[str] | ScoreMultiModalParam documents: ScoreInputs
top_n: int = Field(default_factory=lambda: 0) top_n: int = Field(default_factory=lambda: 0)
# --8<-- [start:rerank-extra-params] # --8<-- [start:rerank-extra-params]
...@@ -108,7 +125,7 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin): ...@@ -108,7 +125,7 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
class RerankDocument(BaseModel): class RerankDocument(BaseModel):
text: str | None = None text: str | None = None
multi_modal: ScoreContentPartParam | None = None multi_modal: list[ScoreContentPartParam] | None = None
class RerankResult(BaseModel): class RerankResult(BaseModel):
......
...@@ -27,12 +27,12 @@ from vllm.entrypoints.pooling.score.protocol import ( ...@@ -27,12 +27,12 @@ from vllm.entrypoints.pooling.score.protocol import (
ScoreResponseData, ScoreResponseData,
) )
from vllm.entrypoints.pooling.score.utils import ( from vllm.entrypoints.pooling.score.utils import (
ScoreContentPartParam, ScoreData,
ScoreMultiModalParam, ScoreInputs,
_cosine_similarity, _cosine_similarity,
_validate_score_input_lens,
compress_token_type_ids, compress_token_type_ids,
get_score_prompt, get_score_prompt,
validate_score_input,
) )
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -65,15 +65,32 @@ class ServingScores(OpenAIServing): ...@@ -65,15 +65,32 @@ class ServingScores(OpenAIServing):
self._tokenizer_executor = ThreadPoolExecutor(max_workers=1) self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)
self.is_cross_encoder = self.model_config.is_cross_encoder
self.is_multimodal_model = self.model_config.is_multimodal_model
self.architecture = self.model_config.architecture
if self.is_cross_encoder:
self._score_func = self._cross_encoding_score
else:
self._score_func = self._embedding_score
async def _embedding_score( async def _embedding_score(
self, self,
data_1: list[str], data_1: list[ScoreData],
data_2: list[str], data_2: list[ScoreData],
request: RerankRequest | ScoreRequest, request: RerankRequest | ScoreRequest,
request_id: str, request_id: str,
lora_request: LoRARequest | None | None = None, lora_request: LoRARequest | None | None = None,
trace_headers: Mapping[str, str] | None = None, trace_headers: Mapping[str, str] | None = None,
) -> list[PoolingRequestOutput] | ErrorResponse: ) -> list[PoolingRequestOutput] | ErrorResponse:
input_texts: list[str] = []
for text in data_1 + data_2:
if not isinstance(text, str):
raise NotImplementedError(
"Embedding scores currently do not support multimodal input."
)
input_texts.append(text)
model_config = self.model_config model_config = self.model_config
tokenizer = self.renderer.get_tokenizer() tokenizer = self.renderer.get_tokenizer()
...@@ -82,8 +99,6 @@ class ServingScores(OpenAIServing): ...@@ -82,8 +99,6 @@ class ServingScores(OpenAIServing):
executor=self._tokenizer_executor, executor=self._tokenizer_executor,
) )
input_texts = data_1 + data_2
tokenization_kwargs = request.build_tok_params(model_config).get_encode_kwargs() tokenization_kwargs = request.build_tok_params(model_config).get_encode_kwargs()
tokenized_prompts = await asyncio.gather( tokenized_prompts = await asyncio.gather(
*(encode_async(t, **tokenization_kwargs) for t in input_texts) *(encode_async(t, **tokenization_kwargs) for t in input_texts)
...@@ -157,60 +172,30 @@ class ServingScores(OpenAIServing): ...@@ -157,60 +172,30 @@ class ServingScores(OpenAIServing):
return final_res_batch return final_res_batch
def _preprocess_score(
self,
request: RerankRequest | ScoreRequest,
tokenizer: TokenizerLike,
tokenization_kwargs: dict[str, Any],
data_1: str | ScoreContentPartParam,
data_2: str | ScoreContentPartParam,
) -> tuple[str, TokensPrompt]:
model_config = self.model_config
full_prompt, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=data_1,
data_2=data_2,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
score_template=self.score_template,
)
self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)
if request.mm_processor_kwargs is not None:
engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs
return full_prompt, engine_prompt
async def _cross_encoding_score( async def _cross_encoding_score(
self, self,
data_1: list[str] | list[ScoreContentPartParam], data_1: list[ScoreData],
data_2: list[str] | list[ScoreContentPartParam], data_2: list[ScoreData],
request: RerankRequest | ScoreRequest, request: RerankRequest | ScoreRequest,
request_id: str, request_id: str,
lora_request: LoRARequest | None | None = None, lora_request: LoRARequest | None | None = None,
trace_headers: Mapping[str, str] | None = None, trace_headers: Mapping[str, str] | None = None,
) -> list[PoolingRequestOutput] | ErrorResponse: ) -> list[PoolingRequestOutput] | ErrorResponse:
model_config = self.model_config
tokenizer = self.renderer.get_tokenizer() tokenizer = self.renderer.get_tokenizer()
if isinstance(tokenizer, MistralTokenizer):
raise ValueError("MistralTokenizer not supported for cross-encoding")
request_prompts: list[str] = [] model_config = self.model_config
engine_prompts: list[TokensPrompt] = []
if len(data_1) == 1: if len(data_1) == 1:
data_1 = data_1 * len(data_2) data_1 = data_1 * len(data_2)
if isinstance(tokenizer, MistralTokenizer):
raise ValueError("MistralTokenizer not supported for cross-encoding")
tok_kwargs = request.build_tok_params(model_config).get_encode_kwargs() tok_kwargs = request.build_tok_params(model_config).get_encode_kwargs()
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)] input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
preprocess_async = make_async( preprocess_async = make_async(
self._preprocess_score, self._preprocess_score,
executor=self._tokenizer_executor, executor=self._tokenizer_executor,
) )
preprocessed_prompts = await asyncio.gather( preprocessed_prompts = await asyncio.gather(
*( *(
preprocess_async( preprocess_async(
...@@ -224,6 +209,8 @@ class ServingScores(OpenAIServing): ...@@ -224,6 +209,8 @@ class ServingScores(OpenAIServing):
) )
) )
request_prompts: list[str] = []
engine_prompts: list[TokensPrompt] = []
for full_prompt, engine_prompt in preprocessed_prompts: for full_prompt, engine_prompt in preprocessed_prompts:
request_prompts.append(full_prompt) request_prompts.append(full_prompt)
engine_prompts.append(engine_prompt) engine_prompts.append(engine_prompt)
...@@ -278,10 +265,33 @@ class ServingScores(OpenAIServing): ...@@ -278,10 +265,33 @@ class ServingScores(OpenAIServing):
return [out for out in final_res_batch if out is not None] return [out for out in final_res_batch if out is not None]
def _preprocess_score(
self,
request: RerankRequest | ScoreRequest,
tokenizer: TokenizerLike,
tokenization_kwargs: dict[str, Any],
data_1: ScoreData,
data_2: ScoreData,
) -> tuple[str, TokensPrompt]:
model_config = self.model_config
full_prompt, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=data_1,
data_2=data_2,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
score_template=self.score_template,
)
self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)
if request.mm_processor_kwargs is not None:
engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs
return full_prompt, engine_prompt
async def _run_scoring( async def _run_scoring(
self, self,
data_1: list[str] | str | ScoreMultiModalParam, data_1: ScoreInputs,
data_2: list[str] | str | ScoreMultiModalParam, data_2: ScoreInputs,
request: ScoreRequest | RerankRequest, request: ScoreRequest | RerankRequest,
request_id: str, request_id: str,
raw_request: Request | None = None, raw_request: Request | None = None,
...@@ -294,39 +304,16 @@ class ServingScores(OpenAIServing): ...@@ -294,39 +304,16 @@ class ServingScores(OpenAIServing):
else await self._get_trace_headers(raw_request.headers) else await self._get_trace_headers(raw_request.headers)
) )
if not self.model_config.is_multimodal_model and ( score_data_1, score_data_2 = validate_score_input(
isinstance(data_1, dict) or isinstance(data_2, dict) data_1,
): data_2,
raise ValueError( is_multimodal_model=self.is_multimodal_model,
f"MultiModalParam is not supported for {self.model_config.architecture}" # noqa: E501 architecture=self.architecture,
)
if isinstance(data_1, str):
data_1 = [data_1]
elif isinstance(data_1, dict):
data_1 = data_1.get("content") # type: ignore[assignment]
if isinstance(data_2, str):
data_2 = [data_2]
elif isinstance(data_2, dict):
data_2 = data_2.get("content") # type: ignore[assignment]
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
if self.model_config.is_cross_encoder:
return await self._cross_encoding_score(
data_1=data_1, # type: ignore[arg-type]
data_2=data_2, # type: ignore[arg-type]
request=request,
request_id=request_id,
lora_request=lora_request,
trace_headers=trace_headers,
) )
else: return await self._score_func(
return await self._embedding_score( data_1=score_data_1,
data_1=data_1, # type: ignore[arg-type] data_2=score_data_2,
data_2=data_2, # type: ignore[arg-type]
request=request, request=request,
request_id=request_id, request_id=request_id,
lora_request=lora_request, lora_request=lora_request,
...@@ -391,15 +378,6 @@ class ServingScores(OpenAIServing): ...@@ -391,15 +378,6 @@ class ServingScores(OpenAIServing):
request_id = f"rerank-{self._base_request_id(raw_request)}" request_id = f"rerank-{self._base_request_id(raw_request)}"
documents = request.documents documents = request.documents
top_n = (
request.top_n
if request.top_n > 0
else (
len(documents)
if isinstance(documents, list)
else len(documents["content"])
)
)
try: try:
final_res_batch = await self._run_scoring( final_res_batch = await self._run_scoring(
...@@ -412,6 +390,8 @@ class ServingScores(OpenAIServing): ...@@ -412,6 +390,8 @@ class ServingScores(OpenAIServing):
if isinstance(final_res_batch, ErrorResponse): if isinstance(final_res_batch, ErrorResponse):
return final_res_batch return final_res_batch
top_n = request.top_n if request.top_n > 0 else len(final_res_batch)
return self.request_output_to_rerank_response( return self.request_output_to_rerank_response(
final_res_batch, final_res_batch,
request_id, request_id,
...@@ -465,22 +445,32 @@ class ServingScores(OpenAIServing): ...@@ -465,22 +445,32 @@ class ServingScores(OpenAIServing):
final_res_batch: list[PoolingRequestOutput], final_res_batch: list[PoolingRequestOutput],
request_id: str, request_id: str,
model_name: str, model_name: str,
documents: list[str] | ScoreMultiModalParam, documents: ScoreInputs,
top_n: int, top_n: int,
) -> RerankResponse: ) -> RerankResponse:
""" """
Convert the output of do_rank to a RerankResponse Convert the output of do_rank to a RerankResponse
""" """
if not isinstance(documents, list):
documents = [documents]
results: list[RerankResult] = [] results: list[RerankResult] = []
num_prompt_tokens = 0 num_prompt_tokens = 0
for idx, final_res in enumerate(final_res_batch): for idx, final_res in enumerate(final_res_batch):
classify_res = ScoringRequestOutput.from_base(final_res) classify_res = ScoringRequestOutput.from_base(final_res)
document = documents[idx]
if isinstance(document, str):
rerank_document = RerankDocument(text=document)
else:
rerank_document = RerankDocument(
multi_modal=document.get("content", [])
)
result = RerankResult( result = RerankResult(
index=idx, index=idx,
document=RerankDocument(text=documents[idx]) document=rerank_document,
if isinstance(documents, list)
else RerankDocument(multi_modal=documents["content"][idx]),
relevance_score=classify_res.outputs.score, relevance_score=classify_res.outputs.score,
) )
results.append(result) results.append(result)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import Any, TypeAlias, cast from typing import Any, TypeAlias, cast
from torch.nn import CosineSimilarity from torch.nn import CosineSimilarity
...@@ -10,12 +11,13 @@ from vllm.entrypoints.chat_utils import ( ...@@ -10,12 +11,13 @@ from vllm.entrypoints.chat_utils import (
BaseMultiModalItemTracker, BaseMultiModalItemTracker,
ChatCompletionContentPartImageEmbedsParam, ChatCompletionContentPartImageEmbedsParam,
ChatCompletionContentPartImageParam, ChatCompletionContentPartImageParam,
ChatCompletionContentPartParam,
ChatCompletionContentPartTextParam, ChatCompletionContentPartTextParam,
ChatCompletionContentPartVideoParam, ChatCompletionContentPartVideoParam,
ChatTemplateResolutionError, ChatTemplateResolutionError,
ConversationMessage,
MultiModalItemTracker, MultiModalItemTracker,
_ContentPart, _parse_chat_message_content_parts,
_parse_chat_message_content_part,
) )
from vllm.inputs import TokensPrompt from vllm.inputs import TokensPrompt
from vllm.model_executor.models.interfaces import supports_score_template from vllm.model_executor.models.interfaces import supports_score_template
...@@ -46,6 +48,13 @@ class ScoreMultiModalParam(TypedDict, total=False): ...@@ -46,6 +48,13 @@ class ScoreMultiModalParam(TypedDict, total=False):
"""The multimodal contents""" """The multimodal contents"""
# Raw input data with content key in ScoreMultiModalParam.
ScoreInput = str | ScoreMultiModalParam
ScoreInputs = ScoreInput | list[ScoreInput]
# Score data without content key.
ScoreData = str | list[ScoreContentPartParam]
def _cosine_similarity( def _cosine_similarity(
tokenizer: TokenizerLike, tokenizer: TokenizerLike,
embed_1: list[PoolingRequestOutput], embed_1: list[PoolingRequestOutput],
...@@ -77,8 +86,8 @@ def _cosine_similarity( ...@@ -77,8 +86,8 @@ def _cosine_similarity(
def _validate_score_input_lens( def _validate_score_input_lens(
data_1: list[str] | list[ScoreContentPartParam], data_1: list[ScoreData],
data_2: list[str] | list[ScoreContentPartParam], data_2: list[ScoreData],
): ):
len_1 = len(data_1) len_1 = len(data_1)
len_2 = len(data_2) len_2 = len(data_2)
...@@ -91,19 +100,56 @@ def _validate_score_input_lens( ...@@ -91,19 +100,56 @@ def _validate_score_input_lens(
raise ValueError("At least one text_pair element must be given") raise ValueError("At least one text_pair element must be given")
def _validate_mm_score_input(
data: list[ScoreInput],
is_multimodal_model: bool,
architecture: str,
) -> list[ScoreData]:
out: list[ScoreData] = []
for d in data:
if isinstance(d, str):
out.append(d)
else:
if not is_multimodal_model:
raise ValueError(f"MultiModalParam is not supported for {architecture}")
content = cast(list[ScoreContentPartParam], d.get("content", []))
out.append(content)
return out
def validate_score_input(
data_1: ScoreInputs,
data_2: ScoreInputs,
is_multimodal_model: bool,
architecture: str,
) -> tuple[list[ScoreData], list[ScoreData]]:
if not isinstance(data_1, list):
data_1 = [data_1]
if not isinstance(data_2, list):
data_2 = [data_2]
score_input_1 = _validate_mm_score_input(data_1, is_multimodal_model, architecture)
score_input_2 = _validate_mm_score_input(data_2, is_multimodal_model, architecture)
_validate_score_input_lens(score_input_1, score_input_2)
return score_input_1, score_input_2
def parse_score_data( def parse_score_data(
data_1: str | ScoreContentPartParam, data_1: ScoreData,
data_2: str | ScoreContentPartParam, data_2: ScoreData,
model_config: ModelConfig, model_config: ModelConfig,
) -> tuple[str, str, MultiModalDataDict | None, MultiModalUUIDDict | None]: ) -> tuple[str, str, MultiModalDataDict | None, MultiModalUUIDDict | None]:
mm_tracker = MultiModalItemTracker(model_config) mm_tracker = MultiModalItemTracker(model_config)
content_1 = _parse_score_content(data_1, mm_tracker) content_1 = _parse_score_content("query", data_1, mm_tracker)
content_2 = _parse_score_content(data_2, mm_tracker) content_2 = _parse_score_content("document", data_2, mm_tracker)
def ensure_str(content: _ContentPart | None) -> str: def ensure_str(content: list[ConversationMessage]) -> str:
if content is not None and isinstance(content, str): assert len(content) == 1
return cast(str, content) prompt = content[0]["content"]
if prompt is not None and isinstance(prompt, str):
return cast(str, prompt)
else: else:
raise ValueError(f"Only string content is supported, but got {content}.") raise ValueError(f"Only string content is supported, but got {content}.")
...@@ -115,19 +161,22 @@ def parse_score_data( ...@@ -115,19 +161,22 @@ def parse_score_data(
def _parse_score_content( def _parse_score_content(
data: str | ScoreContentPartParam, role: str,
data: ScoreData,
mm_tracker: BaseMultiModalItemTracker, mm_tracker: BaseMultiModalItemTracker,
) -> _ContentPart | None: ) -> list[ConversationMessage]:
parts: Iterable[ChatCompletionContentPartParam]
if isinstance(data, str): if isinstance(data, str):
part = ChatCompletionContentPartTextParam(type="text", text=data) parts = [ChatCompletionContentPartTextParam(type="text", text=data)]
else: else:
part = data parts = cast(Iterable[ChatCompletionContentPartParam], data)
mm_parser = mm_tracker.create_parser() mm_parser = mm_tracker.create_parser()
parse_res = _parse_chat_message_content_part( parse_res = _parse_chat_message_content_parts(
part, role=role,
mm_parser, parts=parts,
mm_tracker=mm_tracker,
wrap_dicts=False, wrap_dicts=False,
interleave_strings=False, interleave_strings=False,
) )
...@@ -184,8 +233,8 @@ def get_score_prompt( ...@@ -184,8 +233,8 @@ def get_score_prompt(
model_config: ModelConfig, model_config: ModelConfig,
tokenizer: TokenizerLike, tokenizer: TokenizerLike,
tokenization_kwargs: dict[str, Any], tokenization_kwargs: dict[str, Any],
data_1: str | ScoreContentPartParam, data_1: ScoreData,
data_2: str | ScoreContentPartParam, data_2: ScoreData,
score_template: str | None = None, score_template: str | None = None,
) -> tuple[str, TokensPrompt]: ) -> tuple[str, TokensPrompt]:
prompt_1, prompt_2, mm_data, mm_uuids = parse_score_data( prompt_1, prompt_2, mm_data, mm_uuids = parse_score_data(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment