"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "542a4059b2bb0f790e82822c8b9cbcf8cde91adb"
Unverified Commit d9d21eb8 authored by wang.yuqi's avatar wang.yuqi Committed by GitHub
Browse files

[Frontend][3/n] Improve pooling entrypoints | scoring. (#28631)


Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
parent f09daea2
...@@ -10,9 +10,7 @@ from vllm.entrypoints.openai.chat_completion.protocol import ( ...@@ -10,9 +10,7 @@ from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionStreamResponse, ChatCompletionStreamResponse,
ChatMessage, ChatMessage,
) )
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.engine.protocol import UsageInfo
UsageInfo,
)
async def accumulate_streaming_response( async def accumulate_streaming_response(
......
...@@ -105,7 +105,7 @@ def test_pooling_params(llm: LLM): ...@@ -105,7 +105,7 @@ def test_pooling_params(llm: LLM):
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_score_api(llm: LLM): def test_score_api(llm: LLM):
err_msg = "Score API is only enabled for num_labels == 1." err_msg = "Scoring API is only enabled for num_labels == 1."
with pytest.raises(ValueError, match=err_msg): with pytest.raises(ValueError, match=err_msg):
llm.score("ping", "pong", use_tqdm=False) llm.score("ping", "pong", use_tqdm=False)
......
...@@ -390,7 +390,7 @@ async def test_use_activation(server: RemoteOpenAIServer, model_name: str): ...@@ -390,7 +390,7 @@ async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_score(server: RemoteOpenAIServer, model_name: str): async def test_score(server: RemoteOpenAIServer, model_name: str):
# score api is only enabled for num_labels == 1. # Scoring API is only enabled for num_labels == 1.
response = requests.post( response = requests.post(
server.url_for("score"), server.url_for("score"),
json={ json={
...@@ -405,7 +405,7 @@ async def test_score(server: RemoteOpenAIServer, model_name: str): ...@@ -405,7 +405,7 @@ async def test_score(server: RemoteOpenAIServer, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_rerank(server: RemoteOpenAIServer, model_name: str): async def test_rerank(server: RemoteOpenAIServer, model_name: str):
# rerank api is only enabled for num_labels == 1. # Scoring API is only enabled for num_labels == 1.
response = requests.post( response = requests.post(
server.url_for("rerank"), server.url_for("rerank"),
json={ json={
......
...@@ -7,7 +7,7 @@ import requests ...@@ -7,7 +7,7 @@ import requests
from tests.entrypoints.pooling.scoring.util import EncoderScoringHfRunner from tests.entrypoints.pooling.scoring.util import EncoderScoringHfRunner
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse from vllm.entrypoints.pooling.scoring.protocol import RerankResponse, ScoreResponse
from vllm.platforms import current_platform from vllm.platforms import current_platform
MODEL_NAME = "BAAI/bge-base-en-v1.5" MODEL_NAME = "BAAI/bge-base-en-v1.5"
......
...@@ -8,7 +8,7 @@ import torch.nn.functional as F ...@@ -8,7 +8,7 @@ import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse from vllm.entrypoints.pooling.scoring.protocol import RerankResponse, ScoreResponse
from vllm.platforms import current_platform from vllm.platforms import current_platform
MODEL_NAME = "BAAI/bge-reranker-base" MODEL_NAME = "BAAI/bge-reranker-base"
......
...@@ -7,7 +7,7 @@ import pytest ...@@ -7,7 +7,7 @@ import pytest
import requests import requests
from tests.utils import VLLM_PATH, RemoteOpenAIServer from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse from vllm.entrypoints.pooling.scoring.protocol import RerankResponse, ScoreResponse
from vllm.multimodal.utils import encode_image_url, fetch_image from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.platforms import current_platform from vllm.platforms import current_platform
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
from .util import make_base64_image, make_image_mm_param
MODEL_NAME = "vidore/colpali-v1.3-hf"
@pytest.fixture(scope="module")
def llm():
    """Module-scoped fixture yielding a weak proxy to an ``LLM`` for MODEL_NAME.

    After the module's tests finish, the strong reference is dropped and
    ``cleanup_dist_env_and_memory()`` is called.
    """
    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
    # that supports encoder-only models on ROCm.
    attention_config = (
        {"backend": "FLEX_ATTENTION"} if current_platform.is_rocm() else None
    )

    engine = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
        attention_config=attention_config,
    )

    # pytest caches the fixture value, so hand out a weakref.proxy to keep
    # the engine eligible for garbage collection once we drop it below.
    yield weakref.proxy(engine)

    del engine
    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_query_text_vs_docs_image(llm):
    """A text query scored against a red and a blue image ranks red first."""
    image_docs = [
        make_image_mm_param(make_base64_image(64, 64, color=rgb))
        for rgb in ((255, 0, 0), (0, 0, 255))
    ]

    scores = llm.score("Describe the red object", image_docs)

    assert len(scores) == 2
    red_score = scores[0].outputs.score
    blue_score = scores[1].outputs.score
    assert red_score > blue_score
@pytest.mark.skip_global_cleanup
def test_query_text_vs_docs_mix(llm) -> None:
    """A text query scored against one text and one image document.

    The matching text document is expected to outscore the image document.
    """
    image_doc = make_image_mm_param(make_base64_image(64, 64, color=(255, 0, 0)))
    documents: list = ["The capital of France is Paris.", image_doc]

    scores = llm.score("What is the capital of France?", documents)

    assert len(scores) == 2
    text_score = scores[0].outputs.score
    image_score = scores[1].outputs.score
    assert text_score > image_score
@pytest.mark.skip_global_cleanup
def test_query_image_vs_docs_text(llm) -> None:
    """An image query (with accompanying text) scored against text documents.

    The first document is expected to outscore the second.
    """
    image_query = make_image_mm_param(
        make_base64_image(64, 64, color=(255, 0, 0)),
        text="red color",
    )
    documents = ["Describe the red object.", "The capital of France is Paris."]

    scores = llm.score(image_query, documents)

    assert len(scores) == 2
    first_score = scores[0].outputs.score
    second_score = scores[1].outputs.score
    assert first_score > second_score
...@@ -6,7 +6,7 @@ import pytest ...@@ -6,7 +6,7 @@ import pytest
import requests import requests
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse from vllm.entrypoints.pooling.scoring.protocol import RerankResponse, ScoreResponse
from .util import ColBERTScoringHfRunner from .util import ColBERTScoringHfRunner
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
from tests.entrypoints.pooling.scoring.util import (
make_base64_image,
make_image_mm_param,
)
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.scoring.protocol import RerankResponse, ScoreResponse
MODEL_NAME = "vidore/colpali-v1.3-hf"
@pytest.fixture(scope="module")
def server():
    """Module-scoped fixture yielding a ``RemoteOpenAIServer`` for MODEL_NAME.

    The server is started with no extra CLI arguments and is torn down by the
    context manager once the module's tests are done.
    """
    extra_cli_args: list = []
    with RemoteOpenAIServer(MODEL_NAME, extra_cli_args) as running_server:
        yield running_server
@pytest.mark.asyncio
async def test_score_api_query_text_vs_docs_image(server: RemoteOpenAIServer):
    """POST a text query with a red and a blue image document to ``score``.

    The red image (index 0) is expected to score higher than the blue one.
    """
    documents = [
        make_image_mm_param(make_base64_image(64, 64, color=rgb))
        for rgb in ((255, 0, 0), (0, 0, 255))
    ]
    payload = {
        "model": MODEL_NAME,
        "queries": "Describe the red object",
        "documents": documents,
    }

    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    scores = ScoreResponse.model_validate(score_response.json())

    assert scores.id is not None
    assert scores.data is not None
    assert len(scores.data) == 2
    red_entry, blue_entry = scores.data[0], scores.data[1]
    assert red_entry.score > blue_entry.score
@pytest.mark.asyncio
async def test_score_api_query_text_vs_docs_mix(server: RemoteOpenAIServer):
    """POST a text query with one text and one image document to ``score``.

    The text document (index 0) is expected to score higher than the image.
    """
    image_doc = make_image_mm_param(make_base64_image(64, 64, color=(255, 0, 0)))
    documents: list = ["The capital of France is Paris.", image_doc]
    payload = {
        "model": MODEL_NAME,
        "queries": "What is the capital of France?",
        "documents": documents,
    }

    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    scores = ScoreResponse.model_validate(score_response.json())

    assert scores.id is not None
    assert scores.data is not None
    assert len(scores.data) == 2
    text_entry, image_entry = scores.data[0], scores.data[1]
    assert text_entry.score > image_entry.score
@pytest.mark.asyncio
async def test_score_api_query_image_vs_docs_text(server: RemoteOpenAIServer):
    """POST an image query (plus text) with two text documents to ``score``.

    The first document is expected to score higher than the second.
    """
    image_query = make_image_mm_param(
        make_base64_image(64, 64, color=(255, 0, 0)),
        text="red color",
    )
    payload = {
        "model": MODEL_NAME,
        "queries": image_query,
        "documents": [
            "Describe the red object.",
            "The capital of France is Paris.",
        ],
    }

    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    scores = ScoreResponse.model_validate(score_response.json())

    assert scores.id is not None
    assert scores.data is not None
    assert len(scores.data) == 2
    first_entry, second_entry = scores.data[0], scores.data[1]
    assert first_entry.score > second_entry.score
@pytest.mark.asyncio
async def test_rerank_api_query_text_vs_docs_image(server: RemoteOpenAIServer):
    """POST a text query with a red and a blue image document to ``rerank``.

    Results are looked up by their original document index, and the red
    image (index 0) must have the higher relevance score.
    """
    documents = [
        make_image_mm_param(make_base64_image(64, 64, color=rgb))
        for rgb in ((255, 0, 0), (0, 0, 255))
    ]
    payload = {
        "model": MODEL_NAME,
        "query": "Describe the red object",
        "documents": documents,
    }

    rerank_response = requests.post(server.url_for("rerank"), json=payload)
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.results is not None
    assert len(rerank.results) == 2

    # Match on the original document index rather than on list position.
    red_result = next(item for item in rerank.results if item.index == 0)
    blue_result = next(item for item in rerank.results if item.index == 1)
    assert red_result.relevance_score > blue_result.relevance_score
@pytest.mark.asyncio
async def test_rerank_api_query_text_vs_docs_mix(server: RemoteOpenAIServer):
    """POST a text query with one text and one image document to ``rerank``.

    Results are looked up by their original document index, and the text
    document (index 0) must have the higher relevance score.
    """
    image_doc = make_image_mm_param(make_base64_image(64, 64, color=(255, 0, 0)))
    documents: list = ["The capital of France is Paris.", image_doc]
    payload = {
        "model": MODEL_NAME,
        "query": "What is the capital of France?",
        "documents": documents,
    }

    rerank_response = requests.post(server.url_for("rerank"), json=payload)
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.results is not None
    assert len(rerank.results) == 2

    # Match on the original document index rather than on list position.
    result0 = next(item for item in rerank.results if item.index == 0)
    result1 = next(item for item in rerank.results if item.index == 1)
    assert result0.relevance_score > result1.relevance_score
@pytest.mark.asyncio
async def test_rerank_api_query_image_vs_docs_text(server: RemoteOpenAIServer):
    """POST an image query (plus text) with two text documents to ``rerank``.

    Results are looked up by their original document index, and the document
    at index 0 must have the higher relevance score.
    """
    image_query = make_image_mm_param(
        make_base64_image(64, 64, color=(255, 0, 0)),
        text="red color",
    )
    payload = {
        "model": MODEL_NAME,
        "query": image_query,
        "documents": [
            "Describe the red object.",
            "The capital of France is Paris.",
        ],
    }

    rerank_response = requests.post(server.url_for("rerank"), json=payload)
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.results is not None
    assert len(rerank.results) == 2

    # Match on the original document index rather than on list position.
    result0 = next(item for item in rerank.results if item.index == 0)
    result1 = next(item for item in rerank.results if item.index == 1)
    assert result0.relevance_score > result1.relevance_score
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
from vllm.entrypoints.pooling.score.utils import (
get_score_prompt,
)
from vllm.inputs import TokensPrompt
from vllm.tokenizers import get_tokenizer
# A cross-encoder model for testing
CROSS_ENCODER_MODEL_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2"
def assert_prompt_tokenization_consistent(
    tokenizer, full_prompt, engine_prompt, add_special_tokens=True
):
    """Assert that re-tokenizing ``full_prompt`` reproduces the engine token IDs.

    ``tokenizer`` is called as ``tokenizer(full_prompt, add_special_tokens=...)``
    and its ``"input_ids"`` entry is compared against
    ``engine_prompt["prompt_token_ids"]``. An ``AssertionError`` showing both
    sequences is raised on mismatch.
    """
    encoding = tokenizer(full_prompt, add_special_tokens=add_special_tokens)
    expected_ids = encoding["input_ids"]
    actual_ids = engine_prompt["prompt_token_ids"]

    assert actual_ids == expected_ids, (
        f"Token IDs don't match.\nExpected: {expected_ids}\nActual: {actual_ids}"
    )
@pytest.fixture(scope="module")
def cross_encoder_model_config():
    """Module-scoped ``ModelConfig`` for the cross-encoder, using the pooling runner."""
    model_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
    return model_config
@pytest.fixture(scope="module")
def cross_encoder_tokenizer(cross_encoder_model_config):
    """Module-scoped tokenizer for the cross-encoder model.

    ``trust_remote_code`` is taken from the cross-encoder model config.
    """
    trust_remote_code = cross_encoder_model_config.trust_remote_code
    return get_tokenizer(CROSS_ENCODER_MODEL_ID, trust_remote_code=trust_remote_code)
@pytest.fixture(scope="module")
def llm_reranker_model_config():
    """Model config for the LLM-as-reranker style, with ``use_sep_token`` off."""
    reranker_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")

    # use_sep_token is a property that reads from hf_config, so the override
    # of its default (True) has to be applied on hf_config itself.
    reranker_config.hf_config.use_sep_token = False
    return reranker_config
@pytest.fixture
def tokenization_kwargs():
    """Fresh dict of the tokenization kwargs shared by the tests below."""
    return dict(add_special_tokens=True, return_tensors=None)
@pytest.fixture
def mock_model_with_score_template():
    """Return a stand-in model class that advertises score-template support.

    The class records every prompt handed to ``post_process_tokens`` in its
    ``post_process_called`` list so tests can inspect the calls.
    """

    class MockModelWithScoreTemplate:
        supports_score_template = True
        # Prompts received by post_process_tokens, in call order.
        post_process_called: list[TokensPrompt] = []

        @staticmethod
        def get_score_template(p1: str, p2: str) -> str:
            return f"[QUERY]{p1}[SEP][DOC]{p2}"

        @classmethod
        def post_process_tokens(cls, prompt: TokensPrompt) -> None:
            cls.post_process_called.append(prompt)

    return MockModelWithScoreTemplate
@pytest.fixture
def mock_model_no_score_template():
    """Return a stand-in model class whose ``supports_score_template`` is False."""

    class MockModelNoScoreTemplate:
        # The only attribute this mock defines.
        supports_score_template = False

    mock_cls = MockModelNoScoreTemplate
    return mock_cls
class TestGetScorePrompt:
    """Exercise ``get_score_prompt`` across its template and fallback paths."""

    def test_tokenization_kwargs_passed_through(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
    ):
        """Truncation kwargs should be honoured when building the prompt."""
        max_length = 20
        # Custom kwargs for this test: truncation with a small token budget.
        custom_tokenization_kwargs = dict(
            add_special_tokens=True,
            return_tensors=None,
            truncation=True,
            max_length=max_length,
        )

        full_prompt, engine_prompt = get_score_prompt(
            llm_reranker_model_config,
            cross_encoder_tokenizer,
            custom_tokenization_kwargs,
            "Query text",
            "Document text",
        )

        assert isinstance(full_prompt, str)
        assert "prompt_token_ids" in engine_prompt
        actual_ids = engine_prompt["prompt_token_ids"]
        # With truncation enabled the prompt must fit in max_length tokens.
        assert len(actual_ids) <= max_length

        # Truncated IDs must be a prefix of the untruncated encoding.
        full_encoding = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)
        full_ids = full_encoding["input_ids"]
        assert full_ids[: len(actual_ids)] == actual_ids, (
            f"Token IDs are not a prefix of full encoding.\n"
            f"Full IDs: {full_ids}\n"
            f"Actual IDs: {actual_ids}"
        )

    def test_model_supports_score_template(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """The model's own score template is used when none is passed in."""
        model_cls_patch = patch(
            "vllm.model_executor.model_loader.get_model_cls",
            return_value=mock_model_with_score_template,
        )
        with model_cls_patch:
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query text",
                "document text",
            )

        assert full_prompt == "[QUERY]query text[SEP][DOC]document text"
        assert "prompt_token_ids" in engine_prompt
        assert len(engine_prompt["prompt_token_ids"]) > 0
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_model_supports_score_template_but_custom_template_provided(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """An explicit ``score_template`` wins over the model's own template."""
        template = (
            'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}'
        )
        model_cls_patch = patch(
            "vllm.model_executor.model_loader.get_model_cls",
            return_value=mock_model_with_score_template,
        )
        with model_cls_patch:
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
                score_template=template,  # explicitly supplied template
            )

        assert "prompt_token_ids" in engine_prompt
        assert full_prompt == "TEMPLATE_USED query doc"
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_not_using_default_template(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Without an explicit template the prompt is plain concatenation."""
        # FIXME: For now, we only apply a template when one is explicitly provided.
        # We cannot rely on the tokenizer's chat template because many models
        # inherit junk templates from their base LLM, which breaks both the models
        # and the tests that use them.
        model_cls_patch = patch(
            "vllm.model_executor.model_loader.get_model_cls",
            return_value=mock_model_no_score_template,
        )
        chat_template_patch = patch(
            "vllm.entrypoints.pooling.score.utils.safe_apply_chat_template",
            return_value="test querytest doc",
        )
        with model_cls_patch, chat_template_patch:
            full_prompt, engine_prompt = get_score_prompt(
                llm_reranker_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "test query",
                "test doc",
            )

        assert full_prompt == "test querytest doc"
        assert "prompt_token_ids" in engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_fallback_with_sep_token(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Fallback on ChatTemplateResolutionError with use_sep_token=True."""
        model_cls_patch = patch(
            "vllm.model_executor.model_loader.get_model_cls",
            return_value=mock_model_no_score_template,
        )
        chat_template_patch = patch(
            "vllm.entrypoints.pooling.score.utils.safe_apply_chat_template",
            side_effect=ChatTemplateResolutionError("No template"),
        )
        with model_cls_patch, chat_template_patch:
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,  # use_sep_token=True
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )

        assert "prompt_token_ids" in engine_prompt
        # Should have token_type_ids from text_pair encoding
        assert "token_type_ids" in engine_prompt
        assert "query" in full_prompt
        assert "document" in full_prompt
        assert full_prompt != "querydocument"

        pair_encoding = cross_encoder_tokenizer(
            "query", text_pair="document", add_special_tokens=True
        )
        assert engine_prompt["prompt_token_ids"] == pair_encoding["input_ids"]

        # FIXME(?): add_special_tokens=False is needed because in this case
        # full_prompt is obtained by decoding the tokenized prompt, which includes
        # special tokens and we would get duplicated special tokens otherwise.
        # This is inconsistent with other cases.
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer,
            full_prompt,
            engine_prompt,
            add_special_tokens=False,
        )

    def test_fallback_without_sep_token(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Fallback on ChatTemplateResolutionError with use_sep_token=False."""
        model_cls_patch = patch(
            "vllm.model_executor.model_loader.get_model_cls",
            return_value=mock_model_no_score_template,
        )
        chat_template_patch = patch(
            "vllm.entrypoints.pooling.score.utils.safe_apply_chat_template",
            side_effect=ChatTemplateResolutionError("No template"),
        )
        with model_cls_patch, chat_template_patch:
            full_prompt, engine_prompt = get_score_prompt(
                llm_reranker_model_config,  # use_sep_token=False
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )

        assert full_prompt == "querydocument"
        assert "prompt_token_ids" in engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_post_process_tokens_called(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """``post_process_tokens`` receives the engine prompt exactly once."""
        mock_cls = mock_model_with_score_template
        # Start from an empty call log.
        mock_cls.post_process_called.clear()

        model_cls_patch = patch(
            "vllm.model_executor.model_loader.get_model_cls",
            return_value=mock_cls,
        )
        chat_template_patch = patch(
            "vllm.entrypoints.pooling.score.utils.safe_apply_chat_template",
            side_effect=ChatTemplateResolutionError("No template"),
        )
        with model_cls_patch, chat_template_patch:
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
            )

        # post_process_tokens should have been called once, with the very
        # object that get_score_prompt returned as the engine prompt.
        recorded_prompts = mock_cls.post_process_called
        assert len(recorded_prompts) == 1
        assert recorded_prompts[0] is engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from io import BytesIO
import pybase64 as base64
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from huggingface_hub import hf_hub_download from huggingface_hub import hf_hub_download
from PIL import Image
from safetensors.torch import load_file from safetensors.torch import load_file
from transformers import AutoModel, AutoTokenizer from transformers import AutoModel, AutoTokenizer
from tests.conftest import HfRunner from tests.conftest import HfRunner
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
class ColBERTScoringHfRunner(torch.nn.Module): class ColBERTScoringHfRunner(torch.nn.Module):
...@@ -67,3 +76,32 @@ class EncoderScoringHfRunner(HfRunner): ...@@ -67,3 +76,32 @@ class EncoderScoringHfRunner(HfRunner):
for pair in hf_embeddings for pair in hf_embeddings
] ]
return torch.as_tensor(hf_outputs) return torch.as_tensor(hf_outputs)
def make_base64_image(
    width: int = 64, height: int = 64, color: tuple[int, int, int] = (255, 0, 0)
) -> str:
    """Render a solid-color RGB PNG and return it as a base64 ``data:`` URI.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.
        color: RGB fill color.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    png_buffer = BytesIO()
    Image.new("RGB", (width, height), color).save(png_buffer, format="PNG")
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode()
    return "data:image/png;base64," + encoded_png
def make_image_mm_param(
    image_uri: str,
    text: str | None = None,
) -> ScoreMultiModalParam:
    """Wrap an image URI (and optional text) in a ``ScoreMultiModalParam``.

    Args:
        image_uri: URL placed in the ``image_url`` content part.
        text: When not ``None``, appended as a text content part after the image.

    Returns:
        A ``ScoreMultiModalParam`` whose content holds the image part first,
        followed by the text part if one was given.
    """
    image_part = ChatCompletionContentPartImageParam(
        type="image_url",
        image_url={"url": image_uri},
    )
    if text is None:
        return ScoreMultiModalParam(content=[image_part])

    text_part = ChatCompletionContentPartTextParam(type="text", text=text)
    return ScoreMultiModalParam(content=[image_part, text_part])
...@@ -60,7 +60,7 @@ def test_token_ids_prompts(llm: LLM): ...@@ -60,7 +60,7 @@ def test_token_ids_prompts(llm: LLM):
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_score_api(llm: LLM): def test_score_api(llm: LLM):
err_msg = "Score API is only enabled for num_labels == 1." err_msg = "Scoring API is only enabled for num_labels == 1."
with pytest.raises(ValueError, match=err_msg): with pytest.raises(ValueError, match=err_msg):
llm.score("ping", "pong", use_tqdm=False) llm.score("ping", "pong", use_tqdm=False)
......
...@@ -9,7 +9,7 @@ generic ColBERT support works with different encoder architectures. ...@@ -9,7 +9,7 @@ generic ColBERT support works with different encoder architectures.
import pytest import pytest
import torch import torch
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
# ----------------------------------------------------------------------- # -----------------------------------------------------------------------
# Model definitions: (model_name, colbert_dim, extra vllm_runner kwargs) # Model definitions: (model_name, colbert_dim, extra vllm_runner kwargs)
......
...@@ -10,7 +10,7 @@ embeddings for visual document retrieval. ...@@ -10,7 +10,7 @@ embeddings for visual document retrieval.
import pytest import pytest
import torch import torch
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
MODEL_NAME = "ModernVBERT/colmodernvbert-merged" MODEL_NAME = "ModernVBERT/colmodernvbert-merged"
COLBERT_DIM = 128 COLBERT_DIM = 128
......
...@@ -18,7 +18,7 @@ from vllm.entrypoints.chat_utils import ( ...@@ -18,7 +18,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam, ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam, ChatCompletionContentPartTextParam,
) )
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from ....conftest import VllmRunner from ....conftest import VllmRunner
...@@ -114,7 +114,7 @@ def _run_late_interaction_test( ...@@ -114,7 +114,7 @@ def _run_late_interaction_test(
dtype: str, dtype: str,
) -> None: ) -> None:
"""Verify MaxSim scoring matches manual computation.""" """Verify MaxSim scoring matches manual computation."""
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
with vllm_runner( with vllm_runner(
model, model,
......
...@@ -18,7 +18,7 @@ from vllm.entrypoints.chat_utils import ( ...@@ -18,7 +18,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam, ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam, ChatCompletionContentPartTextParam,
) )
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from ....conftest import VllmRunner from ....conftest import VllmRunner
...@@ -125,7 +125,7 @@ def _run_late_interaction_test( ...@@ -125,7 +125,7 @@ def _run_late_interaction_test(
dtype: str, dtype: str,
) -> None: ) -> None:
"""Verify MaxSim scoring matches manual computation.""" """Verify MaxSim scoring matches manual computation."""
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
with vllm_runner( with vllm_runner(
model, model,
......
...@@ -73,7 +73,7 @@ def _run_late_interaction_test( ...@@ -73,7 +73,7 @@ def _run_late_interaction_test(
dtype: str, dtype: str,
) -> None: ) -> None:
"""Verify MaxSim scoring matches manual computation.""" """Verify MaxSim scoring matches manual computation."""
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
with vllm_runner( with vllm_runner(
model, model,
......
...@@ -11,7 +11,7 @@ from vllm.entrypoints.chat_utils import ( ...@@ -11,7 +11,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam, ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam, ChatCompletionContentPartTextParam,
) )
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from ....conftest import HfRunner, VllmRunner from ....conftest import HfRunner, VllmRunner
......
...@@ -21,7 +21,7 @@ from vllm.entrypoints.chat_utils import ( ...@@ -21,7 +21,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam, ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam, ChatCompletionContentPartTextParam,
) )
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
......
...@@ -46,22 +46,16 @@ from vllm.entrypoints.chat_utils import ( ...@@ -46,22 +46,16 @@ from vllm.entrypoints.chat_utils import (
load_chat_template, load_chat_template,
) )
from vllm.entrypoints.pooling.io_processor_factories import init_pooling_io_processors from vllm.entrypoints.pooling.io_processor_factories import init_pooling_io_processors
from vllm.entrypoints.pooling.score.utils import ( from vllm.entrypoints.pooling.scoring.io_processor import (
ScoreData, ScoringIOProcessor,
ScoreMultiModalParam,
_cosine_similarity,
compress_token_type_ids,
compute_maxsim_score,
get_score_prompt,
score_data_to_prompts,
validate_score_input,
) )
from vllm.entrypoints.pooling.scoring.typing import ScoreInput
from vllm.entrypoints.pooling.typing import OfflineInputsContext, OfflineOutputsContext
from vllm.entrypoints.utils import log_non_default_args from vllm.entrypoints.utils import log_non_default_args
from vllm.inputs import ( from vllm.inputs import (
DataPrompt, DataPrompt,
EngineInput, EngineInput,
PromptType, PromptType,
SingletonPrompt,
TextPrompt, TextPrompt,
TokensPrompt, TokensPrompt,
) )
...@@ -1161,7 +1155,9 @@ class LLM: ...@@ -1161,7 +1155,9 @@ class LLM:
if pooling_task in self.pooling_io_processors: if pooling_task in self.pooling_io_processors:
io_processor = self.pooling_io_processors[pooling_task] io_processor = self.pooling_io_processors[pooling_task]
processor_inputs = io_processor.pre_process_offline( processor_inputs = io_processor.pre_process_offline(
prompts_seq, tokenization_kwargs ctx=OfflineInputsContext(
prompts=prompts_seq, tokenization_kwargs=tokenization_kwargs
)
) )
seq_lora_requests = self._lora_request_to_seq( seq_lora_requests = self._lora_request_to_seq(
lora_request, len(prompts_seq) lora_request, len(prompts_seq)
...@@ -1178,7 +1174,9 @@ class LLM: ...@@ -1178,7 +1174,9 @@ class LLM:
outputs = self._run_engine( outputs = self._run_engine(
use_tqdm=use_tqdm, output_type=PoolingRequestOutput use_tqdm=use_tqdm, output_type=PoolingRequestOutput
) )
outputs = io_processor.post_process_offline(outputs) outputs = io_processor.post_process_offline(
ctx=OfflineOutputsContext(outputs=outputs)
)
else: else:
outputs = self._run_completion( outputs = self._run_completion(
prompts=prompts_seq, prompts=prompts_seq,
...@@ -1378,188 +1376,10 @@ class LLM: ...@@ -1378,188 +1376,10 @@ class LLM:
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
) )
def _embedding_score(
    self,
    data_1: list[ScoreData],
    data_2: list[ScoreData],
    *,
    use_tqdm: bool | Callable[..., tqdm],
    pooling_params: PoolingParams | None,
    lora_request: list[LoRARequest] | LoRARequest | None,
    tokenization_kwargs: dict[str, Any],
) -> list[ScoringRequestOutput]:
    """Score text pairs by embedding both sides and comparing them.

    All items of ``data_1`` and ``data_2`` are encoded in a single
    ``encode`` call with ``pooling_task="embed"``; the result is split
    back into the two sides and handed to ``_cosine_similarity``.

    Raises:
        NotImplementedError: if any item on either side is not a ``str``.
    """
    tokenizer = self.get_tokenizer()

    # Only plain strings are accepted on this path.
    all_items = data_1 + data_2
    if not all(isinstance(item, str) for item in all_items):
        raise NotImplementedError(
            "Embedding scores currently do not support multimodal input."
        )

    embeddings = self.encode(
        all_items,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
        pooling_params=pooling_params,
        pooling_task="embed",
        tokenization_kwargs=tokenization_kwargs,
    )

    # The first len(data_1) results belong to side 1, the rest to side 2.
    split_at = len(data_1)
    side_1 = embeddings[:split_at]
    side_2 = embeddings[split_at:]

    # A lone side-1 result is repeated so it pairs with every side-2 result.
    if len(side_1) == 1:
        side_1 = side_1 * len(side_2)

    similarities = _cosine_similarity(
        tokenizer=tokenizer,
        embed_1=side_1,
        embed_2=side_2,
    )
    return [ScoringRequestOutput.from_base(result) for result in similarities]
def _late_interaction_score(
    self,
    data_1: list[ScoreData],
    data_2: list[ScoreData],
    *,
    use_tqdm: bool | Callable[..., tqdm],
    pooling_params: PoolingParams | None,
    lora_request: list[LoRARequest] | LoRARequest | None,
    tokenization_kwargs: dict[str, Any],
) -> list[ScoringRequestOutput]:
    """
    Late interaction scoring (ColBERT MaxSim).

    Queries and documents are encoded into per-token embeddings with
    ``pooling_task="token_embed"``; each query/document pair is then
    reduced to one score by ``compute_maxsim_score`` (MaxSim: sum over
    query tokens of the max similarity to any document token).
    """
    from vllm.outputs import PoolingOutput

    tokenizer = self.get_tokenizer()

    # Convert ScoreData to PromptType (handles both text and multimodal)
    model_config = self.model_config
    query_prompts = score_data_to_prompts(data_1, "query", model_config)
    doc_prompts = score_data_to_prompts(data_2, "document", model_config)

    encoded: list[PoolingRequestOutput] = self.encode(
        query_prompts + doc_prompts,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
        pooling_params=pooling_params,
        pooling_task="token_embed",
        tokenization_kwargs=tokenization_kwargs,
    )

    num_queries = len(query_prompts)
    query_outputs: list[PoolingRequestOutput] = encoded[:num_queries]
    doc_outputs: list[PoolingRequestOutput] = encoded[num_queries:]

    # A lone query result is repeated so it pairs with every document.
    if len(query_outputs) == 1:
        query_outputs = query_outputs * len(doc_outputs)

    # The pad token (when the tokenizer has one) is placed between the
    # query and document token ids of each combined result.
    pad_token_id = tokenizer.pad_token_id
    separator: list[int] = [] if pad_token_id is None else [pad_token_id]

    def _pair_result(
        query_out: PoolingRequestOutput, doc_out: PoolingRequestOutput
    ) -> PoolingRequestOutput:
        # query_out.outputs.data: [query_len, dim]
        # doc_out.outputs.data: [doc_len, dim]
        maxsim = compute_maxsim_score(query_out.outputs.data, doc_out.outputs.data)
        joined_ids = query_out.prompt_token_ids + separator + doc_out.prompt_token_ids
        return PoolingRequestOutput(
            request_id=f"{query_out.request_id}_{doc_out.request_id}",
            outputs=PoolingOutput(data=maxsim),
            prompt_token_ids=joined_ids,
            num_cached_tokens=query_out.num_cached_tokens
            + doc_out.num_cached_tokens,
            finished=True,
        )

    # Compute MaxSim scores
    pair_results: list[PoolingRequestOutput] = [
        _pair_result(query_out, doc_out)
        for query_out, doc_out in zip(query_outputs, doc_outputs)
    ]
    return [ScoringRequestOutput.from_base(result) for result in pair_results]
def _cross_encoding_score(
    self,
    data_1: list[ScoreData],
    data_2: list[ScoreData],
    *,
    use_tqdm: bool | Callable[..., tqdm],
    pooling_params: PoolingParams | None,
    lora_request: list[LoRARequest] | LoRARequest | None,
    tokenization_kwargs: dict[str, Any],
    score_template: str | None,
) -> list[ScoringRequestOutput]:
    """Score pairs by running each pair as one combined prompt.

    Every ``(data_1[i], data_2[i])`` pair is turned into an engine prompt
    by ``get_score_prompt`` and submitted through ``_run_completion``.
    When ``pooling_params`` is missing or has no task, the task is set to
    ``"classify"`` (on the passed-in object in the latter case).

    Raises:
        ValueError: if the tokenizer is a Mistral tokenizer.
    """
    model_config = self.model_config
    tokenizer = self.get_tokenizer()

    if is_mistral_tokenizer(tokenizer):
        raise ValueError("Score API is not supported for Mistral tokenizer")

    # A single first-side item is repeated so it pairs with every
    # second-side item.
    if len(data_1) == 1:
        data_1 = data_1 * len(data_2)

    if pooling_params is None:
        pooling_params = PoolingParams(task="classify")
    elif pooling_params.task is None:
        pooling_params.task = "classify"

    per_prompt_params: list[PoolingParams] = []
    engine_prompts: list[PromptType] = []

    for query, document in zip(data_1, data_2):
        _unused, engine_prompt = get_score_prompt(
            model_config=model_config,
            data_1=query,
            data_2=document,
            tokenizer=tokenizer,
            tokenization_kwargs=tokenization_kwargs,
            score_template=score_template,
        )

        # token_type_ids are removed from the prompt; when present and
        # non-empty they travel in a per-prompt clone of the pooling
        # params, in compressed form. Otherwise the shared params are used.
        token_type_ids = engine_prompt.pop("token_type_ids", None)
        prompt_params = pooling_params
        if token_type_ids:
            prompt_params = pooling_params.clone()
            compressed_ids = compress_token_type_ids(token_type_ids)
            prompt_params.extra_kwargs = {
                "compressed_token_type_ids": compressed_ids
            }

        per_prompt_params.append(prompt_params)
        engine_prompts.append(engine_prompt)

    results = self._run_completion(
        prompts=engine_prompts,
        params=per_prompt_params,
        output_type=PoolingRequestOutput,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
    )
    return [ScoringRequestOutput.from_base(result) for result in results]
def score( def score(
self, self,
data_1: SingletonPrompt data_1: ScoreInput | list[ScoreInput],
| Sequence[SingletonPrompt] data_2: ScoreInput | list[ScoreInput],
| ScoreMultiModalParam
| list[ScoreMultiModalParam],
data_2: SingletonPrompt
| Sequence[SingletonPrompt]
| ScoreMultiModalParam
| list[ScoreMultiModalParam],
/, /,
*, *,
use_tqdm: bool | Callable[..., tqdm] = True, use_tqdm: bool | Callable[..., tqdm] = True,
...@@ -1606,84 +1426,72 @@ class LLM: ...@@ -1606,84 +1426,72 @@ class LLM:
A list of `ScoringRequestOutput` objects containing the A list of `ScoringRequestOutput` objects containing the
generated scores in the same order as the input prompts. generated scores in the same order as the input prompts.
""" """
model_config = self.model_config
runner_type = model_config.runner_type if self.runner_type != "pooling":
if runner_type != "pooling":
raise ValueError( raise ValueError(
"LLM.score() is only supported for pooling models. " "LLM.score() is only supported for pooling models. "
"Try passing `--runner pooling` to use the model as a " "Try passing `--runner pooling` to use the model as a "
"pooling model." "pooling model."
) )
supported_tasks = self.supported_tasks
score_type = self.model_config.score_type score_type = self.model_config.score_type
is_late_interaction = score_type == "late-interaction" if (
is_cross_encoder = score_type == "cross-encoder" score_type == "cross-encoder"
and getattr(self.model_config.hf_config, "num_labels", 0) != 1
# Late interaction models (e.g., ColBERT) use token_embed for scoring
if not is_late_interaction and all(
t not in supported_tasks for t in ("embed", "classify")
): ):
raise ValueError( raise ValueError("Scoring API is only enabled for num_labels == 1.")
"Score API is not supported by this model. "
"Try converting the model using "
"`--convert embed` or `--convert classify`."
)
if is_cross_encoder and getattr(model_config.hf_config, "num_labels", 0) != 1: if score_type is None or score_type not in self.pooling_io_processors:
raise ValueError("Score API is only enabled for num_labels == 1.") raise ValueError("This model does not support the Scoring API.")
if not is_cross_encoder and chat_template is not None: io_processor = self.pooling_io_processors[score_type]
raise ValueError( assert isinstance(io_processor, ScoringIOProcessor)
"chat_template is only supported for cross-encoder models."
)
is_multimodal_model = model_config.is_multimodal_model pooling_task = io_processor.pooling_task
architecture = model_config.architecture scoring_data = io_processor.valid_inputs(data_1, data_2)
offset = len(scoring_data.data_1)
score_data_1, score_data_2 = validate_score_input( ctx = OfflineInputsContext(
data_1, # type: ignore[arg-type] prompts=scoring_data,
data_2, # type: ignore[arg-type] pooling_params=pooling_params,
is_multimodal_model=is_multimodal_model, tokenization_kwargs=tokenization_kwargs,
architecture=architecture, chat_template=chat_template,
offset=offset,
) )
renderer = self.renderer processor_inputs = io_processor.pre_process_offline(ctx)
tok_params = renderer.default_cmpl_tok_params.with_kwargs(
**(tokenization_kwargs or {})
)
encode_kwargs = tok_params.get_encode_kwargs()
if is_cross_encoder: seq_lora_requests = self._lora_request_to_seq(
return self._cross_encoding_score( lora_request, len(processor_inputs)
score_data_1,
score_data_2,
use_tqdm=use_tqdm,
pooling_params=pooling_params,
lora_request=lora_request,
tokenization_kwargs=encode_kwargs,
score_template=chat_template,
) )
elif is_late_interaction:
return self._late_interaction_score( if ctx.pooling_params is None:
score_data_1, ctx.pooling_params = PoolingParams()
score_data_2, params_seq = self._params_to_seq(ctx.pooling_params, len(processor_inputs))
use_tqdm=use_tqdm,
pooling_params=pooling_params, for param in params_seq:
lora_request=lora_request, if param.task is None:
tokenization_kwargs=encode_kwargs, param.task = pooling_task
elif param.task != pooling_task:
msg = f"You cannot overwrite {param.task=!r} with {pooling_task=!r}!"
raise ValueError(msg)
seq_priority = self._priority_to_seq(None, len(processor_inputs))
self._render_and_add_requests(
prompts=processor_inputs,
params=params_seq,
lora_requests=seq_lora_requests,
priorities=seq_priority,
) )
else:
return self._embedding_score( outputs = self._run_engine(use_tqdm=use_tqdm, output_type=PoolingRequestOutput)
score_data_1, outputs = io_processor.post_process_offline(
score_data_2, ctx=OfflineOutputsContext(outputs=outputs, offset=offset),
use_tqdm=use_tqdm,
pooling_params=pooling_params,
lora_request=lora_request,
tokenization_kwargs=encode_kwargs,
) )
return [ScoringRequestOutput.from_base(item) for item in outputs]
def start_profile(self, profile_prefix: str | None = None) -> None: def start_profile(self, profile_prefix: str | None = None) -> None:
"""Start profiling with optional custom trace prefix. """Start profiling with optional custom trace prefix.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment