Unverified commit c88860d7, authored by wang.yuqi and committed by GitHub
Browse files

[Frontend] Score entrypoint support data_1 & data_2 and queries & documents as inputs (#32577)


Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
parent 758df5af
...@@ -694,7 +694,7 @@ Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](. ...@@ -694,7 +694,7 @@ Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](.
#### Single inference #### Single inference
You can pass a string to both `text_1` and `text_2`, forming a single sentence pair. You can pass a string to both `queries` and `documents`, forming a single sentence pair.
```bash ```bash
curl -X 'POST' \ curl -X 'POST' \
...@@ -704,8 +704,8 @@ curl -X 'POST' \ ...@@ -704,8 +704,8 @@ curl -X 'POST' \
-d '{ -d '{
"model": "BAAI/bge-reranker-v2-m3", "model": "BAAI/bge-reranker-v2-m3",
"encoding_format": "float", "encoding_format": "float",
"text_1": "What is the capital of France?", "queries": "What is the capital of France?",
"text_2": "The capital of France is Paris." "documents": "The capital of France is Paris."
}' }'
``` ```
...@@ -730,9 +730,9 @@ curl -X 'POST' \ ...@@ -730,9 +730,9 @@ curl -X 'POST' \
#### Batch inference #### Batch inference
You can pass a string to `text_1` and a list to `text_2`, forming multiple sentence pairs You can pass a string to `queries` and a list to `documents`, forming multiple sentence pairs
where each pair is built from `text_1` and a string in `text_2`. where each pair is built from `queries` and a string in `documents`.
The total number of pairs is `len(text_2)`. The total number of pairs is `len(documents)`.
??? console "Request" ??? console "Request"
...@@ -743,8 +743,8 @@ The total number of pairs is `len(text_2)`. ...@@ -743,8 +743,8 @@ The total number of pairs is `len(text_2)`.
-H 'Content-Type: application/json' \ -H 'Content-Type: application/json' \
-d '{ -d '{
"model": "BAAI/bge-reranker-v2-m3", "model": "BAAI/bge-reranker-v2-m3",
"text_1": "What is the capital of France?", "queries": "What is the capital of France?",
"text_2": [ "documents": [
"The capital of Brazil is Brasilia.", "The capital of Brazil is Brasilia.",
"The capital of France is Paris." "The capital of France is Paris."
] ]
...@@ -775,9 +775,9 @@ The total number of pairs is `len(text_2)`. ...@@ -775,9 +775,9 @@ The total number of pairs is `len(text_2)`.
} }
``` ```
You can pass a list to both `text_1` and `text_2`, forming multiple sentence pairs You can pass a list to both `queries` and `documents`, forming multiple sentence pairs
where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`). where each pair is built from a string in `queries` and the corresponding string in `documents` (similar to `zip()`).
The total number of pairs is `len(text_2)`. The total number of pairs is `len(documents)`.
??? console "Request" ??? console "Request"
...@@ -789,11 +789,11 @@ The total number of pairs is `len(text_2)`. ...@@ -789,11 +789,11 @@ The total number of pairs is `len(text_2)`.
-d '{ -d '{
"model": "BAAI/bge-reranker-v2-m3", "model": "BAAI/bge-reranker-v2-m3",
"encoding_format": "float", "encoding_format": "float",
"text_1": [ "queries": [
"What is the capital of Brazil?", "What is the capital of Brazil?",
"What is the capital of France?" "What is the capital of France?"
], ],
"text_2": [ "documents": [
"The capital of Brazil is Brasilia.", "The capital of Brazil is Brasilia.",
"The capital of France is Paris." "The capital of France is Paris."
] ]
...@@ -847,8 +847,8 @@ You can pass multi-modal inputs to scoring models by passing `content` including ...@@ -847,8 +847,8 @@ You can pass multi-modal inputs to scoring models by passing `content` including
"http://localhost:8000/v1/score", "http://localhost:8000/v1/score",
json={ json={
"model": "jinaai/jina-reranker-m0", "model": "jinaai/jina-reranker-m0",
"text_1": "slm markdown", "queries": "slm markdown",
"text_2": { "documents": {
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
......
...@@ -21,8 +21,8 @@ def parse_args(): ...@@ -21,8 +21,8 @@ def parse_args():
def main(args: Namespace): def main(args: Namespace):
# Sample prompts. # Sample prompts.
text_1 = "What is the capital of France?" query = "What is the capital of France?"
texts_2 = [ documents = [
"The capital of Brazil is Brasilia.", "The capital of Brazil is Brasilia.",
"The capital of France is Paris.", "The capital of France is Paris.",
] ]
...@@ -32,13 +32,13 @@ def main(args: Namespace): ...@@ -32,13 +32,13 @@ def main(args: Namespace):
llm = LLM(**vars(args)) llm = LLM(**vars(args))
# Generate scores. The output is a list of ScoringRequestOutputs. # Generate scores. The output is a list of ScoringRequestOutputs.
outputs = llm.score(text_1, texts_2) outputs = llm.score(query, documents)
# Print the outputs. # Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60) print("\nGenerated Outputs:\n" + "-" * 60)
for text_2, output in zip(texts_2, outputs): for document, output in zip(documents, outputs):
score = output.outputs.score score = output.outputs.score
print(f"Pair: {[text_1, text_2]!r} \nScore: {score}") print(f"Pair: {[query, document]!r} \nScore: {score}")
print("-" * 60) print("-" * 60)
......
...@@ -255,8 +255,8 @@ cat results.jsonl ...@@ -255,8 +255,8 @@ cat results.jsonl
Add score requests to your batch file. The following is an example: Add score requests to your batch file. The following is an example:
```text ```text
{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} {"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} {"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
``` ```
You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model). You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model).
......
...@@ -50,8 +50,8 @@ documents = [ ...@@ -50,8 +50,8 @@ documents = [
# Request payload for the score API # Request payload for the score API
data = { data = {
"model": "Qwen/Qwen3-Reranker-0.6B", "model": "Qwen/Qwen3-Reranker-0.6B",
"text_1": queries, "queries": queries,
"text_2": documents, "documents": documents,
} }
......
...@@ -30,29 +30,35 @@ def main(args): ...@@ -30,29 +30,35 @@ def main(args):
api_url = f"http://{args.host}:{args.port}/score" api_url = f"http://{args.host}:{args.port}/score"
model_name = args.model model_name = args.model
text_1 = "What is the capital of Brazil?" queries = "What is the capital of Brazil?"
text_2 = "The capital of Brazil is Brasilia." documents = "The capital of Brazil is Brasilia."
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} prompt = {"model": model_name, "queries": queries, "documents": documents}
score_response = post_http_request(prompt=prompt, api_url=api_url) score_response = post_http_request(prompt=prompt, api_url=api_url)
print("\nPrompt when text_1 and text_2 are both strings:") print("\nPrompt when queries and documents are both strings:")
pprint.pprint(prompt) pprint.pprint(prompt)
print("\nScore Response:") print("\nScore Response:")
pprint.pprint(score_response.json()) pprint.pprint(score_response.json())
text_1 = "What is the capital of France?" queries = "What is the capital of France?"
text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."] documents = [
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} "The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
prompt = {"model": model_name, "queries": queries, "documents": documents}
score_response = post_http_request(prompt=prompt, api_url=api_url) score_response = post_http_request(prompt=prompt, api_url=api_url)
print("\nPrompt when text_1 is string and text_2 is a list:") print("\nPrompt when queries is string and documents is a list:")
pprint.pprint(prompt) pprint.pprint(prompt)
print("\nScore Response:") print("\nScore Response:")
pprint.pprint(score_response.json()) pprint.pprint(score_response.json())
text_1 = ["What is the capital of Brazil?", "What is the capital of France?"] queries = ["What is the capital of Brazil?", "What is the capital of France?"]
text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."] documents = [
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} "The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
prompt = {"model": model_name, "queries": queries, "documents": documents}
score_response = post_http_request(prompt=prompt, api_url=api_url) score_response = post_http_request(prompt=prompt, api_url=api_url)
print("\nPrompt when text_1 and text_2 are both lists:") print("\nPrompt when queries and documents are both lists:")
pprint.pprint(prompt) pprint.pprint(prompt)
print("\nScore Response:") print("\nScore Response:")
pprint.pprint(score_response.json()) pprint.pprint(score_response.json())
......
...@@ -18,10 +18,22 @@ e.g. ...@@ -18,10 +18,22 @@ e.g.
""" """
import argparse import argparse
import base64
import json import json
import requests import requests
def encode_base64_content_from_url(content_url: str) -> dict[str, str]:
    """Download ``content_url`` and return it as a base64 ``data:`` URL.

    Args:
        content_url: HTTP(S) address of the image to fetch.

    Returns:
        A dict with a single ``"url"`` key whose value has the form
        ``data:<media type>;base64,<payload>``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Local import keeps this helper self-contained in the example script.
    import mimetypes

    # The request reuses the module-level ``headers`` dict, as before.
    with requests.get(content_url, headers=headers) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode("utf-8")
        # Drop any parameters such as "; charset=..." from the header value.
        mime_type = response.headers.get("Content-Type", "").split(";")[0].strip()

    # Do not label every download as JPEG: trust the server when it reports an
    # image type, otherwise guess from the URL suffix, and only then fall back
    # to the previous hard-coded default.
    if not mime_type.startswith("image/"):
        guessed, _ = mimetypes.guess_type(content_url)
        if guessed is not None and guessed.startswith("image/"):
            mime_type = guessed
        else:
            mime_type = "image/jpeg"

    return {"url": f"data:{mime_type};base64,{result}"}
headers = {"accept": "application/json", "Content-Type": "application/json"} headers = {"accept": "application/json", "Content-Type": "application/json"}
query = "A woman playing with her dog on a beach at sunset." query = "A woman playing with her dog on a beach at sunset."
...@@ -30,8 +42,8 @@ documents = { ...@@ -30,8 +42,8 @@ documents = {
{ {
"type": "text", "type": "text",
"text": ( "text": (
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, " # noqa: E501 "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
"as the dog offers its paw in a heartwarming display of companionship and trust." # noqa: E501 "as the dog offers its paw in a heartwarming display of companionship and trust."
), ),
}, },
{ {
...@@ -40,6 +52,12 @@ documents = { ...@@ -40,6 +52,12 @@ documents = {
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
}, },
}, },
{
"type": "image_url",
"image_url": encode_base64_content_from_url(
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
),
},
] ]
} }
......
...@@ -17,15 +17,27 @@ e.g. ...@@ -17,15 +17,27 @@ e.g.
""" """
import argparse import argparse
import base64
import json import json
import pprint import pprint
import requests import requests
def encode_base64_content_from_url(content_url: str) -> dict[str, str]:
    """Download ``content_url`` and return it as a base64 ``data:`` URL.

    Args:
        content_url: HTTP(S) address of the image to fetch.

    Returns:
        A dict with a single ``"url"`` key whose value has the form
        ``data:<media type>;base64,<payload>``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Local import keeps this helper self-contained in the example script.
    import mimetypes

    # The request reuses the module-level ``headers`` dict, as before.
    with requests.get(content_url, headers=headers) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode("utf-8")
        # Drop any parameters such as "; charset=..." from the header value.
        mime_type = response.headers.get("Content-Type", "").split(";")[0].strip()

    # A fixed "image/jpeg" label is wrong for non-JPEG downloads (this script
    # fetches a ".png"). Trust the server when it reports an image type,
    # otherwise guess from the URL suffix, and only then fall back to the
    # previous hard-coded default.
    if not mime_type.startswith("image/"):
        guessed, _ = mimetypes.guess_type(content_url)
        if guessed is not None and guessed.startswith("image/"):
            mime_type = guessed
        else:
            mime_type = "image/jpeg"

    return {"url": f"data:{mime_type};base64,{result}"}
headers = {"accept": "application/json", "Content-Type": "application/json"} headers = {"accept": "application/json", "Content-Type": "application/json"}
text_1 = "slm markdown" queries = "slm markdown"
text_2 = { documents = {
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
...@@ -39,6 +51,12 @@ text_2 = { ...@@ -39,6 +51,12 @@ text_2 = {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
}, },
}, },
{
"type": "image_url",
"image_url": encode_base64_content_from_url(
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
),
},
] ]
} }
...@@ -58,9 +76,9 @@ def main(args): ...@@ -58,9 +76,9 @@ def main(args):
response = requests.get(models_url, headers=headers) response = requests.get(models_url, headers=headers)
model = response.json()["data"][0]["id"] model = response.json()["data"][0]["id"]
prompt = {"model": model, "text_1": text_1, "text_2": text_2} prompt = {"model": model, "queries": queries, "documents": documents}
response = requests.post(score_url, headers=headers, json=prompt) response = requests.post(score_url, headers=headers, json=prompt)
print("\nPrompt when text_1 is string and text_2 is a image list:") print("\nPrompt when queries is string and documents is a image list:")
pprint.pprint(prompt) pprint.pprint(prompt)
print("\nScore Response:") print("\nScore Response:")
print(json.dumps(response.json(), indent=2)) print(json.dumps(response.json(), indent=2))
......
...@@ -32,8 +32,8 @@ INPUT_EMBEDDING_BATCH = ( ...@@ -32,8 +32,8 @@ INPUT_EMBEDDING_BATCH = (
'{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}' '{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}'
) )
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" {"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} {"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
......
...@@ -251,8 +251,8 @@ async def test_score(server: RemoteOpenAIServer, model_name: str): ...@@ -251,8 +251,8 @@ async def test_score(server: RemoteOpenAIServer, model_name: str):
server.url_for("score"), server.url_for("score"),
json={ json={
"model": model_name, "model": model_name,
"text_1": "ping", "queries": "ping",
"text_2": "pong", "documents": "pong",
}, },
) )
assert response.json()["error"]["type"] == "BadRequestError" assert response.json()["error"]["type"] == "BadRequestError"
......
...@@ -43,12 +43,12 @@ def llm(): ...@@ -43,12 +43,12 @@ def llm():
def test_pooling_params(llm: LLM): def test_pooling_params(llm: LLM):
def get_outputs(use_activation): def get_outputs(use_activation):
text_1 = "What is the capital of France?" queries = "What is the capital of France?"
text_2 = "The capital of France is Paris." documents = "The capital of France is Paris."
outputs = llm.score( outputs = llm.score(
text_1, queries,
text_2, documents,
pooling_params=PoolingParams(use_activation=use_activation), pooling_params=PoolingParams(use_activation=use_activation),
use_tqdm=False, use_tqdm=False,
) )
......
...@@ -61,14 +61,40 @@ def runner(model: dict[str, Any], hf_runner): ...@@ -61,14 +61,40 @@ def runner(model: dict[str, Any], hf_runner):
class TestModel: class TestModel:
def test_text_1_str_text_2_list( def test_queries_str_documents_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
queries = "What is the capital of France?"
documents = "The capital of France is Paris."
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[queries, documents]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_text_1_str_text_2_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner self, server: RemoteOpenAIServer, model: dict[str, Any], runner
): ):
text_1 = "What is the capital of France?" text_1 = "What is the capital of France?"
text_2 = [ text_2 = "The capital of France is Paris."
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post( score_response = requests.post(
server.url_for("score"), server.url_for("score"),
...@@ -83,24 +109,50 @@ class TestModel: ...@@ -83,24 +109,50 @@ class TestModel:
assert score.id is not None assert score.id is not None
assert score.data is not None assert score.data is not None
assert len(score.data) == 2 assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data] vllm_outputs = [d.score for d in score.data]
text_pairs = [[text_1, text_2[0]], [text_1, text_2[1]]] text_pairs = [[text_1, text_2]]
hf_outputs = run_transformers(runner, model, text_pairs) hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)): for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_text_1_list_text_2_list( def test_data_1_str_data_2_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner self, server: RemoteOpenAIServer, model: dict[str, Any], runner
): ):
text_1 = [ data_1 = "What is the capital of France?"
"What is the capital of the United States?", data_2 = "The capital of France is Paris."
"What is the capital of France?",
] score_response = requests.post(
text_2 = [ server.url_for("score"),
json={
"model": model["name"],
"data_1": data_1,
"data_2": data_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[data_1, data_2]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_queries_str_documents_list(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
queries = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.", "The capital of Brazil is Brasilia.",
"The capital of France is Paris.", "The capital of France is Paris.",
] ]
...@@ -109,8 +161,8 @@ class TestModel: ...@@ -109,8 +161,8 @@ class TestModel:
server.url_for("score"), server.url_for("score"),
json={ json={
"model": model["name"], "model": model["name"],
"text_1": text_1, "queries": queries,
"text_2": text_2, "documents": documents,
}, },
) )
score_response.raise_for_status() score_response.raise_for_status()
...@@ -122,24 +174,30 @@ class TestModel: ...@@ -122,24 +174,30 @@ class TestModel:
vllm_outputs = [d.score for d in score.data] vllm_outputs = [d.score for d in score.data]
text_pairs = [[text_1[0], text_2[0]], [text_1[1], text_2[1]]] text_pairs = [[queries, documents[0]], [queries, documents[1]]]
hf_outputs = run_transformers(runner, model, text_pairs) hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)): for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_text_1_str_text_2_str( def test_queries_list_documents_list(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner self, server: RemoteOpenAIServer, model: dict[str, Any], runner
): ):
text_1 = "What is the capital of France?" queries = [
text_2 = "The capital of France is Paris." "What is the capital of the United States?",
"What is the capital of France?",
]
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post( score_response = requests.post(
server.url_for("score"), server.url_for("score"),
json={ json={
"model": model["name"], "model": model["name"],
"text_1": text_1, "queries": queries,
"text_2": text_2, "documents": documents,
}, },
) )
score_response.raise_for_status() score_response.raise_for_status()
...@@ -147,11 +205,11 @@ class TestModel: ...@@ -147,11 +205,11 @@ class TestModel:
assert score.id is not None assert score.id is not None
assert score.data is not None assert score.data is not None
assert len(score.data) == 1 assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data] vllm_outputs = [d.score for d in score.data]
text_pairs = [[text_1, text_2]] text_pairs = [[queries[0], documents[0]], [queries[1], documents[1]]]
hf_outputs = run_transformers(runner, model, text_pairs) hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)): for i in range(len(vllm_outputs)):
...@@ -160,8 +218,8 @@ class TestModel: ...@@ -160,8 +218,8 @@ class TestModel:
def test_score_max_model_len( def test_score_max_model_len(
self, server: RemoteOpenAIServer, model: dict[str, Any] self, server: RemoteOpenAIServer, model: dict[str, Any]
): ):
text_1 = "What is the capital of France?" * 20 queries = "What is the capital of France?" * 20
text_2 = [ documents = [
"The capital of Brazil is Brasilia.", "The capital of Brazil is Brasilia.",
"The capital of France is Paris.", "The capital of France is Paris.",
] ]
...@@ -170,8 +228,8 @@ class TestModel: ...@@ -170,8 +228,8 @@ class TestModel:
server.url_for("score"), server.url_for("score"),
json={ json={
"model": model["name"], "model": model["name"],
"text_1": text_1, "queries": queries,
"text_2": text_2, "documents": documents,
}, },
) )
assert score_response.status_code == 400 assert score_response.status_code == 400
...@@ -183,8 +241,8 @@ class TestModel: ...@@ -183,8 +241,8 @@ class TestModel:
server.url_for("score"), server.url_for("score"),
json={ json={
"model": model["name"], "model": model["name"],
"text_1": text_1, "queries": queries,
"text_2": text_2, "documents": documents,
"truncate_prompt_tokens": 101, "truncate_prompt_tokens": 101,
}, },
) )
...@@ -192,13 +250,13 @@ class TestModel: ...@@ -192,13 +250,13 @@ class TestModel:
assert "Please, select a smaller truncation size." in score_response.text assert "Please, select a smaller truncation size." in score_response.text
def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]): def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]):
text_1 = "What is the capital of France?" queries = "What is the capital of France?"
text_2 = "The capital of France is Paris." documents = "The capital of France is Paris."
request_args = { request_args = {
"model": model["name"], "model": model["name"],
"text_1": text_1, "queries": queries,
"text_2": text_2, "documents": documents,
} }
score_response = requests.post(server.url_for("score"), json=request_args) score_response = requests.post(server.url_for("score"), json=request_args)
...@@ -225,14 +283,14 @@ class TestModel: ...@@ -225,14 +283,14 @@ class TestModel:
def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]): def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]):
def get_outputs(use_activation): def get_outputs(use_activation):
text_1 = "What is the capital of France?" queries = "What is the capital of France?"
text_2 = "The capital of France is Paris." documents = "The capital of France is Paris."
response = requests.post( response = requests.post(
server.url_for("score"), server.url_for("score"),
json={ json={
"model": model["name"], "model": model["name"],
"text_1": text_1, "queries": queries,
"text_2": text_2, "documents": documents,
"use_activation": use_activation, "use_activation": use_activation,
}, },
) )
......
...@@ -117,8 +117,8 @@ class ScoreClientMtebEncoder(MtebCrossEncoderMixin): ...@@ -117,8 +117,8 @@ class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
self.url, self.url,
json={ json={
"model": self.model_name, "model": self.model_name,
"text_1": query, "queries": query,
"text_2": corpus, "documents": corpus,
"truncate_prompt_tokens": -1, "truncate_prompt_tokens": -1,
}, },
).json() ).json()
......
...@@ -84,8 +84,11 @@ from vllm.entrypoints.pooling.pooling.protocol import ( ...@@ -84,8 +84,11 @@ from vllm.entrypoints.pooling.pooling.protocol import (
) )
from vllm.entrypoints.pooling.score.protocol import ( from vllm.entrypoints.pooling.score.protocol import (
RerankRequest, RerankRequest,
ScoreDataRequest,
ScoreQueriesDocumentsRequest,
ScoreRequest, ScoreRequest,
ScoreResponse, ScoreResponse,
ScoreTextRequest,
) )
from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
...@@ -1032,7 +1035,9 @@ class OpenAIServing: ...@@ -1032,7 +1035,9 @@ class OpenAIServing:
( (
EmbeddingChatRequest, EmbeddingChatRequest,
EmbeddingCompletionRequest, EmbeddingCompletionRequest,
ScoreRequest, ScoreDataRequest,
ScoreTextRequest,
ScoreQueriesDocumentsRequest,
RerankRequest, RerankRequest,
ClassificationCompletionRequest, ClassificationCompletionRequest,
ClassificationChatRequest, ClassificationChatRequest,
...@@ -1042,7 +1047,9 @@ class OpenAIServing: ...@@ -1042,7 +1047,9 @@ class OpenAIServing:
# since these requests don't generate tokens. # since these requests don't generate tokens.
if token_num > self.max_model_len: if token_num > self.max_model_len:
operations: dict[type[AnyRequest], str] = { operations: dict[type[AnyRequest], str] = {
ScoreRequest: "score", ScoreDataRequest: "score",
ScoreTextRequest: "score",
ScoreQueriesDocumentsRequest: "score",
ClassificationCompletionRequest: "classification", ClassificationCompletionRequest: "classification",
ClassificationChatRequest: "classification", ClassificationChatRequest: "classification",
} }
......
...@@ -85,7 +85,7 @@ class BatchRequestInput(OpenAIBaseModel): ...@@ -85,7 +85,7 @@ class BatchRequestInput(OpenAIBaseModel):
if url == "/v1/embeddings": if url == "/v1/embeddings":
return TypeAdapter(EmbeddingRequest).validate_python(value) return TypeAdapter(EmbeddingRequest).validate_python(value)
if url.endswith("/score"): if url.endswith("/score"):
return ScoreRequest.model_validate(value) return TypeAdapter(ScoreRequest).validate_python(value)
if url.endswith("/rerank"): if url.endswith("/rerank"):
return RerankRequest.model_validate(value) return RerankRequest.model_validate(value)
return TypeAdapter(BatchRequestInputBody).validate_python(value) return TypeAdapter(BatchRequestInputBody).validate_python(value)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time import time
from typing import Any from typing import Any, TypeAlias
from pydantic import ( from pydantic import (
BaseModel, BaseModel,
...@@ -19,10 +19,7 @@ from vllm.entrypoints.pooling.score.utils import ( ...@@ -19,10 +19,7 @@ from vllm.entrypoints.pooling.score.utils import (
from vllm.utils import random_uuid from vllm.utils import random_uuid
class ScoreRequest(PoolingBasicRequestMixin): class ScoreRequestMixin(PoolingBasicRequestMixin):
text_1: list[str] | str | ScoreMultiModalParam
text_2: list[str] | str | ScoreMultiModalParam
# --8<-- [start:score-extra-params] # --8<-- [start:score-extra-params]
mm_processor_kwargs: dict[str, Any] | None = Field( mm_processor_kwargs: dict[str, Any] | None = Field(
default=None, default=None,
...@@ -53,6 +50,42 @@ class ScoreRequest(PoolingBasicRequestMixin): ...@@ -53,6 +50,42 @@ class ScoreRequest(PoolingBasicRequestMixin):
) )
# One side of a score pair: a list of strings, a single string, or a
# multi-modal payload. Shared by every request form below.
_ScoreInput: TypeAlias = list[str] | str | ScoreMultiModalParam


class ScoreDataRequest(ScoreRequestMixin):
    # Request form whose two inputs are named ``data_1`` / ``data_2``.
    data_1: _ScoreInput
    data_2: _ScoreInput


class ScoreQueriesDocumentsRequest(ScoreRequestMixin):
    # Request form whose two inputs are named ``queries`` / ``documents``.
    queries: _ScoreInput
    documents: _ScoreInput

    @property
    def data_1(self) -> _ScoreInput:
        """Expose ``queries`` under the ``data_1`` name."""
        return self.queries

    @property
    def data_2(self) -> _ScoreInput:
        """Expose ``documents`` under the ``data_2`` name."""
        return self.documents


class ScoreTextRequest(ScoreRequestMixin):
    # Request form whose two inputs are named ``text_1`` / ``text_2``.
    text_1: _ScoreInput
    text_2: _ScoreInput

    @property
    def data_1(self) -> _ScoreInput:
        """Expose ``text_1`` under the ``data_1`` name."""
        return self.text_1

    @property
    def data_2(self) -> _ScoreInput:
        """Expose ``text_2`` under the ``data_2`` name."""
        return self.text_2


# Any of the three accepted request forms; each one offers ``data_1`` and
# ``data_2`` (as fields or as properties).
ScoreRequest: TypeAlias = (
    ScoreQueriesDocumentsRequest | ScoreDataRequest | ScoreTextRequest
)
class RerankRequest(PoolingBasicRequestMixin): class RerankRequest(PoolingBasicRequestMixin):
query: str | ScoreMultiModalParam query: str | ScoreMultiModalParam
documents: list[str] | ScoreMultiModalParam documents: list[str] | ScoreMultiModalParam
......
...@@ -66,15 +66,15 @@ class ServingScores(OpenAIServing): ...@@ -66,15 +66,15 @@ class ServingScores(OpenAIServing):
async def _embedding_score( async def _embedding_score(
self, self,
tokenizer: TokenizerLike, tokenizer: TokenizerLike,
texts_1: list[str], data_1: list[str],
texts_2: list[str], data_2: list[str],
request: RerankRequest | ScoreRequest, request: RerankRequest | ScoreRequest,
request_id: str, request_id: str,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
lora_request: LoRARequest | None | None = None, lora_request: LoRARequest | None | None = None,
trace_headers: Mapping[str, str] | None = None, trace_headers: Mapping[str, str] | None = None,
) -> list[PoolingRequestOutput] | ErrorResponse: ) -> list[PoolingRequestOutput] | ErrorResponse:
input_texts = texts_1 + texts_2 input_texts = data_1 + data_2
engine_prompts: list[TokensPrompt] = [] engine_prompts: list[TokensPrompt] = []
tokenize_async = make_async( tokenize_async = make_async(
...@@ -135,22 +135,22 @@ class ServingScores(OpenAIServing): ...@@ -135,22 +135,22 @@ class ServingScores(OpenAIServing):
async for i, res in result_generator: async for i, res in result_generator:
embeddings[i] = res embeddings[i] = res
emb_texts_1: list[PoolingRequestOutput] = [] emb_data_1: list[PoolingRequestOutput] = []
emb_texts_2: list[PoolingRequestOutput] = [] emb_data_2: list[PoolingRequestOutput] = []
for i in range(0, len(texts_1)): for i in range(0, len(data_1)):
assert (emb := embeddings[i]) is not None assert (emb := embeddings[i]) is not None
emb_texts_1.append(emb) emb_data_1.append(emb)
for i in range(len(texts_1), len(embeddings)): for i in range(len(data_1), len(embeddings)):
assert (emb := embeddings[i]) is not None assert (emb := embeddings[i]) is not None
emb_texts_2.append(emb) emb_data_2.append(emb)
if len(emb_texts_1) == 1: if len(emb_data_1) == 1:
emb_texts_1 = emb_texts_1 * len(emb_texts_2) emb_data_1 = emb_data_1 * len(emb_data_2)
final_res_batch = _cosine_similarity( final_res_batch = _cosine_similarity(
tokenizer=tokenizer, embed_1=emb_texts_1, embed_2=emb_texts_2 tokenizer=tokenizer, embed_1=emb_data_1, embed_2=emb_data_2
) )
return final_res_batch return final_res_batch
...@@ -333,8 +333,8 @@ class ServingScores(OpenAIServing): ...@@ -333,8 +333,8 @@ class ServingScores(OpenAIServing):
else: else:
return await self._embedding_score( return await self._embedding_score(
tokenizer=tokenizer, tokenizer=tokenizer,
texts_1=data_1, # type: ignore[arg-type] data_1=data_1, # type: ignore[arg-type]
texts_2=data_2, # type: ignore[arg-type] data_2=data_2, # type: ignore[arg-type]
request=request, request=request,
request_id=request_id, request_id=request_id,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
...@@ -361,8 +361,8 @@ class ServingScores(OpenAIServing): ...@@ -361,8 +361,8 @@ class ServingScores(OpenAIServing):
try: try:
final_res_batch = await self._run_scoring( final_res_batch = await self._run_scoring(
request.text_1, request.data_1,
request.text_2, request.data_2,
request, request,
request_id, request_id,
raw_request, raw_request,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment