Unverified commit d9d21eb8, authored by wang.yuqi and committed by GitHub
Browse files

[Frontend][3/n] Improve pooling entrypoints | scoring. (#28631)


Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
parent f09daea2
......@@ -10,9 +10,7 @@ from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionStreamResponse,
ChatMessage,
)
from vllm.entrypoints.openai.engine.protocol import (
UsageInfo,
)
from vllm.entrypoints.openai.engine.protocol import UsageInfo
async def accumulate_streaming_response(
......
......@@ -105,7 +105,7 @@ def test_pooling_params(llm: LLM):
@pytest.mark.skip_global_cleanup
def test_score_api(llm: LLM):
err_msg = "Score API is only enabled for num_labels == 1."
err_msg = "Scoring API is only enabled for num_labels == 1."
with pytest.raises(ValueError, match=err_msg):
llm.score("ping", "pong", use_tqdm=False)
......
......@@ -390,7 +390,7 @@ async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_score(server: RemoteOpenAIServer, model_name: str):
# score api is only enabled for num_labels == 1.
# Scoring API is only enabled for num_labels == 1.
response = requests.post(
server.url_for("score"),
json={
......@@ -405,7 +405,7 @@ async def test_score(server: RemoteOpenAIServer, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_rerank(server: RemoteOpenAIServer, model_name: str):
# rerank api is only enabled for num_labels == 1.
# Scoring API is only enabled for num_labels == 1.
response = requests.post(
server.url_for("rerank"),
json={
......
......@@ -7,7 +7,7 @@ import requests
from tests.entrypoints.pooling.scoring.util import EncoderScoringHfRunner
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
from vllm.entrypoints.pooling.scoring.protocol import RerankResponse, ScoreResponse
from vllm.platforms import current_platform
MODEL_NAME = "BAAI/bge-base-en-v1.5"
......
......@@ -8,7 +8,7 @@ import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
from vllm.entrypoints.pooling.scoring.protocol import RerankResponse, ScoreResponse
from vllm.platforms import current_platform
MODEL_NAME = "BAAI/bge-reranker-base"
......
......@@ -7,7 +7,7 @@ import pytest
import requests
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
from vllm.entrypoints.pooling.scoring.protocol import RerankResponse, ScoreResponse
from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.platforms import current_platform
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
from .util import make_base64_image, make_image_mm_param
MODEL_NAME = "vidore/colpali-v1.3-hf"
@pytest.fixture(scope="module")
def llm():
    """Build one ``LLM`` for ``MODEL_NAME`` and share it across this module.

    The fixture yields a ``weakref.proxy`` instead of the engine itself:
    pytest caches the yielded value, and handing out a strong reference
    would prevent the engine from being garbage collected during teardown.
    """
    # ROCm: FLEX_ATTENTION is the only attention backend that supports
    # encoder-only models on ROCm, so it is forced on that platform.
    rocm_attention = {"backend": "FLEX_ATTENTION"}

    engine = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
        attention_config=rocm_attention if current_platform.is_rocm() else None,
    )

    yield weakref.proxy(engine)

    # Teardown: drop the only strong reference, then release distributed
    # state and memory.
    del engine
    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_query_text_vs_docs_image(llm):
    """A text query about a red object must score the red image above the blue."""
    solid_colors = [(255, 0, 0), (0, 0, 255)]  # red first, then blue
    image_docs = [
        make_image_mm_param(make_base64_image(64, 64, color=rgb))
        for rgb in solid_colors
    ]

    results = llm.score("Describe the red object", image_docs)

    assert len(results) == 2
    red_result, blue_result = results[0], results[1]
    assert red_result.outputs.score > blue_result.outputs.score
@pytest.mark.skip_global_cleanup
def test_query_text_vs_docs_mix(llm) -> None:
    """With mixed documents, the matching text must outscore an unrelated image."""
    unrelated_image = make_image_mm_param(
        make_base64_image(64, 64, color=(255, 0, 0))
    )
    documents: list = ["The capital of France is Paris.", unrelated_image]

    results = llm.score("What is the capital of France?", documents)

    assert len(results) == 2
    text_result, image_result = results[0], results[1]
    assert text_result.outputs.score > image_result.outputs.score
@pytest.mark.skip_global_cleanup
def test_query_image_vs_docs_text(llm) -> None:
    """An image query (red image plus caption) must prefer the red-object text."""
    image_query = make_image_mm_param(
        make_base64_image(64, 64, color=(255, 0, 0)),
        text="red color",
    )
    related_doc = "Describe the red object."
    unrelated_doc = "The capital of France is Paris."

    results = llm.score(image_query, [related_doc, unrelated_doc])

    assert len(results) == 2
    related_result, unrelated_result = results[0], results[1]
    assert related_result.outputs.score > unrelated_result.outputs.score
......@@ -6,7 +6,7 @@ import pytest
import requests
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
from vllm.entrypoints.pooling.scoring.protocol import RerankResponse, ScoreResponse
from .util import ColBERTScoringHfRunner
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
from tests.entrypoints.pooling.scoring.util import (
make_base64_image,
make_image_mm_param,
)
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.scoring.protocol import RerankResponse, ScoreResponse
MODEL_NAME = "vidore/colpali-v1.3-hf"
@pytest.fixture(scope="module")
def server():
    """Start one ``RemoteOpenAIServer`` for ``MODEL_NAME`` per test module.

    The server is launched without extra CLI arguments and is shut down by
    the context manager once the module's tests have finished.
    """
    extra_cli_args: list = []
    with RemoteOpenAIServer(MODEL_NAME, extra_cli_args) as remote_server:
        yield remote_server
@pytest.mark.asyncio
async def test_score_api_query_text_vs_docs_image(server: RemoteOpenAIServer):
    """Score endpoint: a red-object text query must prefer the red image."""
    red_doc, blue_doc = (
        make_image_mm_param(make_base64_image(64, 64, color=rgb))
        for rgb in ((255, 0, 0), (0, 0, 255))
    )
    payload = {
        "model": MODEL_NAME,
        "queries": "Describe the red object",
        "documents": [red_doc, blue_doc],
    }

    http_response = requests.post(server.url_for("score"), json=payload)
    http_response.raise_for_status()
    parsed = ScoreResponse.model_validate(http_response.json())

    assert parsed.id is not None
    assert parsed.data is not None
    assert len(parsed.data) == 2
    # data[0] belongs to the red image, data[1] to the blue one.
    assert parsed.data[0].score > parsed.data[1].score
@pytest.mark.asyncio
async def test_score_api_query_text_vs_docs_mix(server: RemoteOpenAIServer):
    """Score endpoint: matching text must outscore an unrelated image document."""
    unrelated_image = make_image_mm_param(
        make_base64_image(64, 64, color=(255, 0, 0))
    )
    documents: list = ["The capital of France is Paris.", unrelated_image]
    payload = {
        "model": MODEL_NAME,
        "queries": "What is the capital of France?",
        "documents": documents,
    }

    http_response = requests.post(server.url_for("score"), json=payload)
    http_response.raise_for_status()
    parsed = ScoreResponse.model_validate(http_response.json())

    assert parsed.id is not None
    assert parsed.data is not None
    assert len(parsed.data) == 2
    # data[0] is the text document, data[1] the image document.
    assert parsed.data[0].score > parsed.data[1].score
@pytest.mark.asyncio
async def test_score_api_query_image_vs_docs_text(server: RemoteOpenAIServer):
    """Score endpoint: an image query must prefer the red-object text document."""
    image_query = make_image_mm_param(
        make_base64_image(64, 64, color=(255, 0, 0)),
        text="red color",
    )
    payload = {
        "model": MODEL_NAME,
        "queries": image_query,
        "documents": [
            "Describe the red object.",
            "The capital of France is Paris.",
        ],
    }

    http_response = requests.post(server.url_for("score"), json=payload)
    http_response.raise_for_status()
    parsed = ScoreResponse.model_validate(http_response.json())

    assert parsed.id is not None
    assert parsed.data is not None
    assert len(parsed.data) == 2
    assert parsed.data[0].score > parsed.data[1].score
@pytest.mark.asyncio
async def test_rerank_api_query_text_vs_docs_image(server: RemoteOpenAIServer):
    """Rerank endpoint: a red-object text query must prefer the red image."""
    red_doc, blue_doc = (
        make_image_mm_param(make_base64_image(64, 64, color=rgb))
        for rgb in ((255, 0, 0), (0, 0, 255))
    )
    payload = {
        "model": MODEL_NAME,
        "query": "Describe the red object",
        "documents": [red_doc, blue_doc],
    }

    http_response = requests.post(server.url_for("rerank"), json=payload)
    http_response.raise_for_status()
    parsed = RerankResponse.model_validate(http_response.json())

    assert parsed.id is not None
    assert parsed.results is not None
    assert len(parsed.results) == 2

    # Look results up by their original document index rather than by
    # position in the returned list.
    red_result = next(entry for entry in parsed.results if entry.index == 0)
    blue_result = next(entry for entry in parsed.results if entry.index == 1)
    assert red_result.relevance_score > blue_result.relevance_score
@pytest.mark.asyncio
async def test_rerank_api_query_text_vs_docs_mix(server: RemoteOpenAIServer):
    """Rerank endpoint: matching text must outrank an unrelated image document."""
    unrelated_image = make_image_mm_param(
        make_base64_image(64, 64, color=(255, 0, 0))
    )
    documents: list = ["The capital of France is Paris.", unrelated_image]
    payload = {
        "model": MODEL_NAME,
        "query": "What is the capital of France?",
        "documents": documents,
    }

    http_response = requests.post(server.url_for("rerank"), json=payload)
    http_response.raise_for_status()
    parsed = RerankResponse.model_validate(http_response.json())

    assert parsed.id is not None
    assert parsed.results is not None
    assert len(parsed.results) == 2

    # Look results up by their original document index rather than by
    # position in the returned list.
    text_result = next(entry for entry in parsed.results if entry.index == 0)
    image_result = next(entry for entry in parsed.results if entry.index == 1)
    assert text_result.relevance_score > image_result.relevance_score
@pytest.mark.asyncio
async def test_rerank_api_query_image_vs_docs_text(server: RemoteOpenAIServer):
    """Rerank endpoint: an image query must prefer the red-object text document."""
    image_query = make_image_mm_param(
        make_base64_image(64, 64, color=(255, 0, 0)),
        text="red color",
    )
    payload = {
        "model": MODEL_NAME,
        "query": image_query,
        "documents": [
            "Describe the red object.",
            "The capital of France is Paris.",
        ],
    }

    http_response = requests.post(server.url_for("rerank"), json=payload)
    http_response.raise_for_status()
    parsed = RerankResponse.model_validate(http_response.json())

    assert parsed.id is not None
    assert parsed.results is not None
    assert len(parsed.results) == 2

    # Look results up by their original document index rather than by
    # position in the returned list.
    related_result = next(entry for entry in parsed.results if entry.index == 0)
    unrelated_result = next(entry for entry in parsed.results if entry.index == 1)
    assert related_result.relevance_score > unrelated_result.relevance_score
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
from vllm.entrypoints.pooling.score.utils import (
get_score_prompt,
)
from vllm.inputs import TokensPrompt
from vllm.tokenizers import get_tokenizer
# A cross-encoder model for testing
CROSS_ENCODER_MODEL_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2"
def assert_prompt_tokenization_consistent(
    tokenizer, full_prompt, engine_prompt, add_special_tokens=True
):
    """Assert that ``engine_prompt`` carries exactly the tokens of ``full_prompt``.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(full_prompt, add_special_tokens=...)``; its result is
            indexed with ``"input_ids"``.
        full_prompt: Prompt text to re-tokenize for comparison.
        engine_prompt: Mapping whose ``"prompt_token_ids"`` entry is checked.
        add_special_tokens: Forwarded unchanged to ``tokenizer``.

    Raises:
        AssertionError: If the two token-id sequences differ.
    """
    encoding = tokenizer(full_prompt, add_special_tokens=add_special_tokens)
    expected_ids = encoding["input_ids"]
    actual_ids = engine_prompt["prompt_token_ids"]

    assert actual_ids == expected_ids, (
        f"Token IDs don't match.\nExpected: {expected_ids}\nActual: {actual_ids}"
    )
@pytest.fixture(scope="module")
def cross_encoder_model_config():
    """Module-scoped pooling ``ModelConfig`` for ``CROSS_ENCODER_MODEL_ID``."""
    model_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
    return model_config
@pytest.fixture(scope="module")
def cross_encoder_tokenizer(cross_encoder_model_config):
    """Module-scoped tokenizer for ``CROSS_ENCODER_MODEL_ID``.

    ``trust_remote_code`` is taken from the cross-encoder model config so the
    tokenizer is loaded under the same setting as the model.
    """
    trust_remote_code = cross_encoder_model_config.trust_remote_code
    return get_tokenizer(CROSS_ENCODER_MODEL_ID, trust_remote_code=trust_remote_code)
@pytest.fixture(scope="module")
def llm_reranker_model_config():
    """Model config for LLM-as-reranker style, i.e. with ``use_sep_token`` off."""
    reranker_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")

    # ``use_sep_token`` is a property that reads from ``hf_config``, so the
    # override has to be written there to replace the default (True).
    reranker_config.hf_config.use_sep_token = False

    return reranker_config
@pytest.fixture
def tokenization_kwargs():
    """Default tokenization kwargs shared by the tests (fresh dict per test)."""
    return dict(add_special_tokens=True, return_tensors=None)
@pytest.fixture
def mock_model_with_score_template():
    """Return a stand-in model class that supports score templates.

    The class is created anew for each test, so the ``post_process_called``
    log starts out empty every time the fixture is requested.
    """

    class MockModelWithScoreTemplate:
        supports_score_template = True
        # Every prompt handed to ``post_process_tokens``, in call order.
        post_process_called: list[TokensPrompt] = []

        @staticmethod
        def get_score_template(p1: str, p2: str) -> str:
            """Render the fixed, easily recognizable template used in asserts."""
            return f"[QUERY]{p1}[SEP][DOC]{p2}"

        @classmethod
        def post_process_tokens(cls, prompt: TokensPrompt) -> None:
            """Record the prompt so tests can verify the hook was invoked."""
            cls.post_process_called.append(prompt)

    return MockModelWithScoreTemplate
@pytest.fixture
def mock_model_no_score_template():
    """Return a stand-in model class that reports no score-template support."""

    class MockModelNoScoreTemplate:
        # The only attribute this mock defines; it opts out of score templates.
        supports_score_template = False

    return MockModelNoScoreTemplate
class TestGetScorePrompt:
    """Exercise ``get_score_prompt`` across its template and fallback paths."""

    # Dotted paths patched by the tests below.
    _GET_MODEL_CLS = "vllm.model_executor.model_loader.get_model_cls"
    _APPLY_CHAT_TEMPLATE = (
        "vllm.entrypoints.pooling.score.utils.safe_apply_chat_template"
    )

    @classmethod
    def _patch_model_cls(cls, model_cls):
        """Patch ``get_model_cls`` so that it returns ``model_cls``."""
        return patch(cls._GET_MODEL_CLS, return_value=model_cls)

    @classmethod
    def _patch_chat_template_failure(cls):
        """Patch ``safe_apply_chat_template`` to raise a resolution error."""
        return patch(
            cls._APPLY_CHAT_TEMPLATE,
            side_effect=ChatTemplateResolutionError("No template"),
        )

    def test_tokenization_kwargs_passed_through(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
    ):
        """Test that tokenization kwargs are properly passed through."""
        max_length = 20
        # Custom kwargs for this test: enable truncation with a small limit.
        truncating_kwargs = {
            "add_special_tokens": True,
            "return_tensors": None,
            "truncation": True,
            "max_length": max_length,
        }

        full_prompt, engine_prompt = get_score_prompt(
            llm_reranker_model_config,
            cross_encoder_tokenizer,
            truncating_kwargs,
            "Query text",
            "Document text",
        )

        assert isinstance(full_prompt, str)
        assert "prompt_token_ids" in engine_prompt

        # With truncation enabled the prompt must not exceed max_length.
        actual_ids = engine_prompt["prompt_token_ids"]
        assert len(actual_ids) <= max_length

        # Since truncation was applied, the token ids should be a prefix of
        # the full encoding of the prompt text.
        full_encoding = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)
        full_ids = full_encoding["input_ids"]
        assert full_ids[: len(actual_ids)] == actual_ids, (
            f"Token IDs are not a prefix of full encoding.\n"
            f"Full IDs: {full_ids}\n"
            f"Actual IDs: {actual_ids}"
        )

    def test_model_supports_score_template(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """Test when model supports score template (no score_template arg)."""
        with self._patch_model_cls(mock_model_with_score_template):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query text",
                "document text",
            )

        assert full_prompt == "[QUERY]query text[SEP][DOC]document text"
        assert "prompt_token_ids" in engine_prompt
        assert len(engine_prompt["prompt_token_ids"]) > 0
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_model_supports_score_template_but_custom_template_provided(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """Test when model supports score template but custom template is provided."""
        custom_template = (
            'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}'
        )

        with self._patch_model_cls(mock_model_with_score_template):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
                score_template=custom_template,  # Providing a template
            )

        assert "prompt_token_ids" in engine_prompt
        assert full_prompt == "TEMPLATE_USED query doc"
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_not_using_default_template(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Without an explicit template the two texts are simply concatenated."""
        # FIXME: For now, we only apply a template when one is explicitly provided.
        # We cannot rely on the tokenizer's chat template because many models
        # inherit junk templates from their base LLM, which breaks both the models
        # and the tests that use them.
        with (
            self._patch_model_cls(mock_model_no_score_template),
            patch(self._APPLY_CHAT_TEMPLATE, return_value="test querytest doc"),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                llm_reranker_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "test query",
                "test doc",
            )

        assert full_prompt == "test querytest doc"
        assert "prompt_token_ids" in engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_fallback_with_sep_token(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Test fallback path when ChatTemplateResolutionError
        and use_sep_token=True."""
        with (
            self._patch_model_cls(mock_model_no_score_template),
            self._patch_chat_template_failure(),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,  # use_sep_token=True
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )

        assert "prompt_token_ids" in engine_prompt
        # Should have token_type_ids from text_pair encoding
        assert "token_type_ids" in engine_prompt
        assert "query" in full_prompt
        assert "document" in full_prompt
        assert full_prompt != "querydocument"

        pair_encoding = cross_encoder_tokenizer(
            "query", text_pair="document", add_special_tokens=True
        )
        assert engine_prompt["prompt_token_ids"] == pair_encoding["input_ids"]

        # FIXME(?): add_special_tokens=False is needed because in this case
        # full_prompt is obtained by decoding the tokenized prompt, which includes
        # special tokens and we would get duplicated special tokens otherwise.
        # This is inconsistent with other cases.
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer,
            full_prompt,
            engine_prompt,
            add_special_tokens=False,
        )

    def test_fallback_without_sep_token(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Test fallback path when ChatTemplateResolutionError
        and use_sep_token=False."""
        with (
            self._patch_model_cls(mock_model_no_score_template),
            self._patch_chat_template_failure(),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                llm_reranker_model_config,  # use_sep_token=False
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )

        assert full_prompt == "querydocument"
        assert "prompt_token_ids" in engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_post_process_tokens_called(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """Test that post_process_tokens is called on the engine prompt."""
        # Reset the call tracker
        recorded_prompts = mock_model_with_score_template.post_process_called
        recorded_prompts.clear()

        with (
            self._patch_model_cls(mock_model_with_score_template),
            self._patch_chat_template_failure(),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
            )

        # post_process_tokens should have been called once, with the very
        # object that was returned as the engine prompt.
        assert len(mock_model_with_score_template.post_process_called) == 1
        assert mock_model_with_score_template.post_process_called[0] is engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from io import BytesIO
import pybase64 as base64
import torch
import torch.nn.functional as F
from huggingface_hub import hf_hub_download
from PIL import Image
from safetensors.torch import load_file
from transformers import AutoModel, AutoTokenizer
from tests.conftest import HfRunner
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score
from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
class ColBERTScoringHfRunner(torch.nn.Module):
......@@ -67,3 +76,32 @@ class EncoderScoringHfRunner(HfRunner):
for pair in hf_embeddings
]
return torch.as_tensor(hf_outputs)
def make_base64_image(
    width: int = 64, height: int = 64, color: tuple[int, int, int] = (255, 0, 0)
) -> str:
    """Render a solid-color RGB PNG in memory and return it as a base64 data URI.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.
        color: RGB fill color.

    Returns:
        A ``data:image/png;base64,...`` URI containing the encoded PNG.
    """
    png_buffer = BytesIO()
    Image.new("RGB", (width, height), color).save(png_buffer, format="PNG")
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode()
    return f"data:image/png;base64,{encoded_png}"
def make_image_mm_param(
    image_uri: str,
    text: str | None = None,
) -> ScoreMultiModalParam:
    """Wrap an image URI, and optionally some text, in a ``ScoreMultiModalParam``.

    Args:
        image_uri: URL placed in the ``image_url`` content part.
        text: Optional text; when given it is added as a second content part
            after the image.

    Returns:
        A ``ScoreMultiModalParam`` whose content is the image part followed
        by the text part, if any.
    """
    image_part = ChatCompletionContentPartImageParam(
        type="image_url",
        image_url={"url": image_uri},
    )
    text_parts = (
        []
        if text is None
        else [ChatCompletionContentPartTextParam(type="text", text=text)]
    )
    content: list = [image_part, *text_parts]
    return ScoreMultiModalParam(content=content)
......@@ -60,7 +60,7 @@ def test_token_ids_prompts(llm: LLM):
@pytest.mark.skip_global_cleanup
def test_score_api(llm: LLM):
err_msg = "Score API is only enabled for num_labels == 1."
err_msg = "Scoring API is only enabled for num_labels == 1."
with pytest.raises(ValueError, match=err_msg):
llm.score("ping", "pong", use_tqdm=False)
......
......@@ -9,7 +9,7 @@ generic ColBERT support works with different encoder architectures.
import pytest
import torch
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score
from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
# -----------------------------------------------------------------------
# Model definitions: (model_name, colbert_dim, extra vllm_runner kwargs)
......
......@@ -10,7 +10,7 @@ embeddings for visual document retrieval.
import pytest
import torch
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score
from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
MODEL_NAME = "ModernVBERT/colmodernvbert-merged"
COLBERT_DIM = 128
......
......@@ -18,7 +18,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from ....conftest import VllmRunner
......@@ -114,7 +114,7 @@ def _run_late_interaction_test(
dtype: str,
) -> None:
"""Verify MaxSim scoring matches manual computation."""
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score
from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
with vllm_runner(
model,
......
......@@ -18,7 +18,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from ....conftest import VllmRunner
......@@ -125,7 +125,7 @@ def _run_late_interaction_test(
dtype: str,
) -> None:
"""Verify MaxSim scoring matches manual computation."""
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score
from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
with vllm_runner(
model,
......
......@@ -73,7 +73,7 @@ def _run_late_interaction_test(
dtype: str,
) -> None:
"""Verify MaxSim scoring matches manual computation."""
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score
from vllm.entrypoints.pooling.scoring.utils import compute_maxsim_score
with vllm_runner(
model,
......
......@@ -11,7 +11,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from ....conftest import HfRunner, VllmRunner
......
......@@ -21,7 +21,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from vllm.platforms import current_platform
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
......
......@@ -46,22 +46,16 @@ from vllm.entrypoints.chat_utils import (
load_chat_template,
)
from vllm.entrypoints.pooling.io_processor_factories import init_pooling_io_processors
from vllm.entrypoints.pooling.score.utils import (
ScoreData,
ScoreMultiModalParam,
_cosine_similarity,
compress_token_type_ids,
compute_maxsim_score,
get_score_prompt,
score_data_to_prompts,
validate_score_input,
from vllm.entrypoints.pooling.scoring.io_processor import (
ScoringIOProcessor,
)
from vllm.entrypoints.pooling.scoring.typing import ScoreInput
from vllm.entrypoints.pooling.typing import OfflineInputsContext, OfflineOutputsContext
from vllm.entrypoints.utils import log_non_default_args
from vllm.inputs import (
DataPrompt,
EngineInput,
PromptType,
SingletonPrompt,
TextPrompt,
TokensPrompt,
)
......@@ -1161,7 +1155,9 @@ class LLM:
if pooling_task in self.pooling_io_processors:
io_processor = self.pooling_io_processors[pooling_task]
processor_inputs = io_processor.pre_process_offline(
prompts_seq, tokenization_kwargs
ctx=OfflineInputsContext(
prompts=prompts_seq, tokenization_kwargs=tokenization_kwargs
)
)
seq_lora_requests = self._lora_request_to_seq(
lora_request, len(prompts_seq)
......@@ -1178,7 +1174,9 @@ class LLM:
outputs = self._run_engine(
use_tqdm=use_tqdm, output_type=PoolingRequestOutput
)
outputs = io_processor.post_process_offline(outputs)
outputs = io_processor.post_process_offline(
ctx=OfflineOutputsContext(outputs=outputs)
)
else:
outputs = self._run_completion(
prompts=prompts_seq,
......@@ -1378,188 +1376,10 @@ class LLM:
tokenization_kwargs=tokenization_kwargs,
)
def _embedding_score(
self,
data_1: list[ScoreData],
data_2: list[ScoreData],
*,
use_tqdm: bool | Callable[..., tqdm],
pooling_params: PoolingParams | None,
lora_request: list[LoRARequest] | LoRARequest | None,
tokenization_kwargs: dict[str, Any],
) -> list[ScoringRequestOutput]:
tokenizer = self.get_tokenizer()
input_texts: list[str] = []
for text in data_1 + data_2:
if not isinstance(text, str):
raise NotImplementedError(
"Embedding scores currently do not support multimodal input."
)
input_texts.append(text)
encoded_output = self.encode(
input_texts,
use_tqdm=use_tqdm,
lora_request=lora_request,
pooling_params=pooling_params,
pooling_task="embed",
tokenization_kwargs=tokenization_kwargs,
)
encoded_output_1 = encoded_output[0 : len(data_1)]
encoded_output_2 = encoded_output[len(data_1) :]
if len(encoded_output_1) == 1:
encoded_output_1 = encoded_output_1 * len(encoded_output_2)
scores = _cosine_similarity(
tokenizer=tokenizer,
embed_1=encoded_output_1,
embed_2=encoded_output_2,
)
return [ScoringRequestOutput.from_base(item) for item in scores]
def _late_interaction_score(
self,
data_1: list[ScoreData],
data_2: list[ScoreData],
*,
use_tqdm: bool | Callable[..., tqdm],
pooling_params: PoolingParams | None,
lora_request: list[LoRARequest] | LoRARequest | None,
tokenization_kwargs: dict[str, Any],
) -> list[ScoringRequestOutput]:
"""
Late interaction scoring (ColBERT MaxSim).
Encodes queries and documents into per-token embeddings, then computes
MaxSim: sum over query tokens of max similarity to any document token.
"""
from vllm.outputs import PoolingOutput
tokenizer = self.get_tokenizer()
# Convert ScoreData to PromptType (handles both text and multimodal)
model_config = self.model_config
prompts_1 = score_data_to_prompts(data_1, "query", model_config)
prompts_2 = score_data_to_prompts(data_2, "document", model_config)
encoded_output: list[PoolingRequestOutput] = self.encode(
prompts_1 + prompts_2,
use_tqdm=use_tqdm,
lora_request=lora_request,
pooling_params=pooling_params,
pooling_task="token_embed",
tokenization_kwargs=tokenization_kwargs,
)
encoded_output_1: list[PoolingRequestOutput] = encoded_output[: len(prompts_1)]
encoded_output_2: list[PoolingRequestOutput] = encoded_output[len(prompts_1) :]
if len(encoded_output_1) == 1:
encoded_output_1 = encoded_output_1 * len(encoded_output_2)
# Compute MaxSim scores
scores: list[PoolingRequestOutput] = []
padding: list[int] = []
if (pad_token_id := tokenizer.pad_token_id) is not None:
padding = [pad_token_id]
for emb_1, emb_2 in zip(encoded_output_1, encoded_output_2):
# emb_1.outputs.data: [query_len, dim]
# emb_2.outputs.data: [doc_len, dim]
q_emb = emb_1.outputs.data
d_emb = emb_2.outputs.data
maxsim_score = compute_maxsim_score(q_emb, d_emb)
tokens = emb_1.prompt_token_ids + padding + emb_2.prompt_token_ids
scores.append(
PoolingRequestOutput(
request_id=f"{emb_1.request_id}_{emb_2.request_id}",
outputs=PoolingOutput(data=maxsim_score),
prompt_token_ids=tokens,
num_cached_tokens=emb_1.num_cached_tokens + emb_2.num_cached_tokens,
finished=True,
)
)
return [ScoringRequestOutput.from_base(item) for item in scores]
def _cross_encoding_score(
self,
data_1: list[ScoreData],
data_2: list[ScoreData],
*,
use_tqdm: bool | Callable[..., tqdm],
pooling_params: PoolingParams | None,
lora_request: list[LoRARequest] | LoRARequest | None,
tokenization_kwargs: dict[str, Any],
score_template: str | None,
) -> list[ScoringRequestOutput]:
model_config = self.model_config
tokenizer = self.get_tokenizer()
if is_mistral_tokenizer(tokenizer):
raise ValueError("Score API is not supported for Mistral tokenizer")
if len(data_1) == 1:
data_1 = data_1 * len(data_2)
if pooling_params is None:
pooling_params = PoolingParams(task="classify")
elif pooling_params.task is None:
pooling_params.task = "classify"
pooling_params_list = list[PoolingParams]()
prompts = list[PromptType]()
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
score_template=score_template,
)
if token_type_ids := engine_prompt.pop("token_type_ids", None):
params = pooling_params.clone()
compressed = compress_token_type_ids(token_type_ids)
params.extra_kwargs = {"compressed_token_type_ids": compressed}
pooling_params_list.append(params)
else:
pooling_params_list.append(pooling_params)
prompts.append(engine_prompt)
outputs = self._run_completion(
prompts=prompts,
params=pooling_params_list,
output_type=PoolingRequestOutput,
use_tqdm=use_tqdm,
lora_request=lora_request,
)
return [ScoringRequestOutput.from_base(item) for item in outputs]
def score(
self,
data_1: SingletonPrompt
| Sequence[SingletonPrompt]
| ScoreMultiModalParam
| list[ScoreMultiModalParam],
data_2: SingletonPrompt
| Sequence[SingletonPrompt]
| ScoreMultiModalParam
| list[ScoreMultiModalParam],
data_1: ScoreInput | list[ScoreInput],
data_2: ScoreInput | list[ScoreInput],
/,
*,
use_tqdm: bool | Callable[..., tqdm] = True,
......@@ -1606,84 +1426,72 @@ class LLM:
A list of `ScoringRequestOutput` objects containing the
generated scores in the same order as the input prompts.
"""
model_config = self.model_config
runner_type = model_config.runner_type
if runner_type != "pooling":
if self.runner_type != "pooling":
raise ValueError(
"LLM.score() is only supported for pooling models. "
"Try passing `--runner pooling` to use the model as a "
"pooling model."
)
supported_tasks = self.supported_tasks
score_type = self.model_config.score_type
is_late_interaction = score_type == "late-interaction"
is_cross_encoder = score_type == "cross-encoder"
# Late interaction models (e.g., ColBERT) use token_embed for scoring
if not is_late_interaction and all(
t not in supported_tasks for t in ("embed", "classify")
if (
score_type == "cross-encoder"
and getattr(self.model_config.hf_config, "num_labels", 0) != 1
):
raise ValueError(
"Score API is not supported by this model. "
"Try converting the model using "
"`--convert embed` or `--convert classify`."
)
raise ValueError("Scoring API is only enabled for num_labels == 1.")
if is_cross_encoder and getattr(model_config.hf_config, "num_labels", 0) != 1:
raise ValueError("Score API is only enabled for num_labels == 1.")
if score_type is None or score_type not in self.pooling_io_processors:
raise ValueError("This model does not support the Scoring API.")
if not is_cross_encoder and chat_template is not None:
raise ValueError(
"chat_template is only supported for cross-encoder models."
)
io_processor = self.pooling_io_processors[score_type]
assert isinstance(io_processor, ScoringIOProcessor)
is_multimodal_model = model_config.is_multimodal_model
architecture = model_config.architecture
pooling_task = io_processor.pooling_task
scoring_data = io_processor.valid_inputs(data_1, data_2)
offset = len(scoring_data.data_1)
score_data_1, score_data_2 = validate_score_input(
data_1, # type: ignore[arg-type]
data_2, # type: ignore[arg-type]
is_multimodal_model=is_multimodal_model,
architecture=architecture,
ctx = OfflineInputsContext(
prompts=scoring_data,
pooling_params=pooling_params,
tokenization_kwargs=tokenization_kwargs,
chat_template=chat_template,
offset=offset,
)
renderer = self.renderer
tok_params = renderer.default_cmpl_tok_params.with_kwargs(
**(tokenization_kwargs or {})
)
encode_kwargs = tok_params.get_encode_kwargs()
processor_inputs = io_processor.pre_process_offline(ctx)
if is_cross_encoder:
return self._cross_encoding_score(
score_data_1,
score_data_2,
use_tqdm=use_tqdm,
pooling_params=pooling_params,
lora_request=lora_request,
tokenization_kwargs=encode_kwargs,
score_template=chat_template,
seq_lora_requests = self._lora_request_to_seq(
lora_request, len(processor_inputs)
)
elif is_late_interaction:
return self._late_interaction_score(
score_data_1,
score_data_2,
use_tqdm=use_tqdm,
pooling_params=pooling_params,
lora_request=lora_request,
tokenization_kwargs=encode_kwargs,
if ctx.pooling_params is None:
ctx.pooling_params = PoolingParams()
params_seq = self._params_to_seq(ctx.pooling_params, len(processor_inputs))
for param in params_seq:
if param.task is None:
param.task = pooling_task
elif param.task != pooling_task:
msg = f"You cannot overwrite {param.task=!r} with {pooling_task=!r}!"
raise ValueError(msg)
seq_priority = self._priority_to_seq(None, len(processor_inputs))
self._render_and_add_requests(
prompts=processor_inputs,
params=params_seq,
lora_requests=seq_lora_requests,
priorities=seq_priority,
)
else:
return self._embedding_score(
score_data_1,
score_data_2,
use_tqdm=use_tqdm,
pooling_params=pooling_params,
lora_request=lora_request,
tokenization_kwargs=encode_kwargs,
outputs = self._run_engine(use_tqdm=use_tqdm, output_type=PoolingRequestOutput)
outputs = io_processor.post_process_offline(
ctx=OfflineOutputsContext(outputs=outputs, offset=offset),
)
return [ScoringRequestOutput.from_base(item) for item in outputs]
def start_profile(self, profile_prefix: str | None = None) -> None:
"""Start profiling with optional custom trace prefix.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment