Unverified Commit bf73a3e4 authored by twj's avatar twj Committed by GitHub
Browse files

[Bugfix][Frontend] Fix Jina reranker multimodal input compatibility (#31445)


Signed-off-by: default avatartianwenjing <tianwenjing@jfgenius.com>
Signed-off-by: default avatartwj <151701930+twjww@users.noreply.github.com>
Co-authored-by: default avatartianwenjing <tianwenjing@jfgenius.com>
Co-authored-by: default avatargemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent 3ecfdc37
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import cast
import pytest import pytest
from transformers import AutoModel from transformers import AutoModel
from vllm.entrypoints.chat_utils import ChatCompletionContentPartImageParam from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageEmbedsParam,
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.score_utils import ScoreMultiModalParam from vllm.entrypoints.score_utils import ScoreMultiModalParam
from ....conftest import HfRunner, VllmRunner from ....conftest import HfRunner, VllmRunner
model_name = "jinaai/jina-reranker-m0" MODELS = ["jinaai/jina-reranker-m0"]
mm_processor_kwargs = { MM_PROCESSOR_KWARGS = {
"min_pixels": 3136, "min_pixels": 3136,
"max_pixels": 602112, "max_pixels": 602112,
} }
limit_mm_per_prompt = {"image": 2} LIMIT_MM_PER_PROMPT = {"image": 2}
CHECKPOINT_TO_HF_MAPPER = {
"visual.": "model.visual.",
"model.": "model.language_model.",
}
# Shared long text for test data
LONG_TEXT_DOC = """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements.""" # noqa: E501
# Test data for different scenarios
TEXT_IMAGE_TEST_DATA = {
"query": [{"text": "slm markdown"}],
"documents": [
{
"image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
{
"image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
],
}
TEXT_TEXT_TEST_DATA = {
"query": [{"text": "slm markdown"}],
"documents": [
{"text": LONG_TEXT_DOC},
{"text": "数据提取么?为什么不用正则啊,你用正则不就全解决了么?"},
],
}
IMAGE_TEXT_TEST_DATA = {
"query": [
{
"image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
}
],
"documents": [
{"text": LONG_TEXT_DOC},
{"text": "数据提取么?为什么不用正则啊,你用正则不就全解决了么?"},
],
}
IMAGE_IMAGE_TEST_DATA = {
"query": [
{
"image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
}
],
"documents": [
{
"image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
{
"image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
],
}
def vllm_reranker( TEXT_MIXED_DOCS_TEST_DATA = {
"query": [{"text": "slm markdown"}],
"documents": [
{"text": LONG_TEXT_DOC},
{
"image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
{"text": "数据提取么?为什么不用正则啊,你用正则不就全解决了么?"},
{
"image": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
],
}
def _normalize_image(image_val: str) -> str:
"""Normalize image value to proper format for HF model."""
return (
image_val
if image_val.startswith(("http://", "https://"))
else f"data:image/png;base64,{image_val}"
)
def create_score_multimodal_param(
content_parts: list[dict],
) -> ScoreMultiModalParam:
"""
Create a ScoreMultiModalParam from a list of content dictionaries.
Each dict supports the following formats:
- Text: {'text': 'content'}
- Image URL: {'image': 'https://...'}
- Image Base64: {'image': 'base64_str'}
"""
formatted_content = []
for part in content_parts:
if "text" in part:
formatted_content.append(
ChatCompletionContentPartTextParam(
type="text",
text=part["text"],
)
)
elif "image" in part:
image_val = part["image"]
if image_val.startswith(("http://", "https://")):
formatted_content.append(
ChatCompletionContentPartImageParam(
type="image_url",
image_url={"url": image_val},
)
)
else:
formatted_content.append(
ChatCompletionContentPartImageEmbedsParam(
type="image_embeds", image_embeds=image_val
)
)
return ScoreMultiModalParam(content=formatted_content)
def _run_vllm(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
model_name: str, model: str,
dtype: str, dtype: str,
query_strs: list[str], query_strs: list[dict[str, str]],
document_strs: list[str], document_strs: list[dict[str, str]],
query_type: str = "text", ) -> list[float]:
doc_type: str = "text", """Run vLLM reranker and return scores."""
): query = create_score_multimodal_param(query_strs)
def create_image_param(url: str) -> ChatCompletionContentPartImageParam: documents = create_score_multimodal_param(document_strs)
return {"type": "image_url", "image_url": {"url": f"{url}"}}
query: list[str] | ScoreMultiModalParam
if query_type == "text":
query = query_strs
elif query_type == "image":
query = ScoreMultiModalParam(
content=[create_image_param(url) for url in query_strs]
)
documents: list[str] | ScoreMultiModalParam
if doc_type == "text":
documents = document_strs
elif doc_type == "image":
documents = ScoreMultiModalParam(
content=[create_image_param(url) for url in document_strs]
)
with vllm_runner( with vllm_runner(
model_name, model,
runner="pooling", runner="pooling",
dtype=dtype, dtype=dtype,
max_num_seqs=2, max_num_seqs=2,
max_model_len=2048, max_model_len=2048,
mm_processor_kwargs=mm_processor_kwargs, mm_processor_kwargs=MM_PROCESSOR_KWARGS,
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model: ) as vllm_model:
outputs = vllm_model.llm.score(query, documents) outputs = vllm_model.llm.score(query, documents)
return [output.outputs.score for output in outputs] return [output.outputs.score for output in outputs]
def hf_reranker( def _run_hf(
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
model_name: str, model: str,
dtype: str, dtype: str,
query_strs: list[str], query_strs: list[dict[str, str]],
document_strs: list[str], document_strs: list[dict[str, str]],
query_type: str = "text", ) -> list[float]:
doc_type: str = "text", """Run HuggingFace reranker and return scores."""
): query = query_strs[0]
checkpoint_to_hf_mapper = { if "text" in query:
"visual.": "model.visual.", query_type = "text"
"model.": "model.language_model.", query_data = query["text"]
} elif "image" in query:
query_type = "image"
data_pairs = [[query_strs[0], d] for d in document_strs] query_data = _normalize_image(query["image"])
else:
raise ValueError("Unsupported query format")
# Separate documents by type
text_docs: list[str] = []
image_docs: list[str] = []
text_indices: list[int] = []
image_indices: list[int] = []
for idx, doc in enumerate(document_strs):
if "text" in doc:
text_docs.append(doc["text"])
text_indices.append(idx)
elif "image" in doc:
image_docs.append(_normalize_image(doc["image"]))
image_indices.append(idx)
else:
raise ValueError(f"Unsupported document format at index {idx}")
scores: list[None | float] = [None] * len(document_strs)
with hf_runner( with hf_runner(
model_name, model,
dtype=dtype, dtype=dtype,
trust_remote_code=True, trust_remote_code=True,
auto_cls=AutoModel, auto_cls=AutoModel,
model_kwargs={"key_mapping": checkpoint_to_hf_mapper}, model_kwargs={"key_mapping": CHECKPOINT_TO_HF_MAPPER},
) as hf_model: ) as hf_model:
return hf_model.model.compute_score( # Score text documents
data_pairs, max_length=2048, query_type=query_type, doc_type=doc_type if text_docs:
) text_scores = hf_model.model.compute_score(
[[query_data, d] for d in text_docs],
max_length=2048,
query_type=query_type,
doc_type="text",
)
for i, s in zip(text_indices, text_scores):
scores[i] = s
# Score image documents
if image_docs:
image_scores = hf_model.model.compute_score(
[[query_data, d] for d in image_docs],
max_length=2048,
query_type=query_type,
doc_type="image",
)
for i, s in zip(image_indices, image_scores):
scores[i] = s
# Visual Documents Reranking assert all(s is not None for s in scores)
@pytest.mark.parametrize("model_name", [model_name]) return cast(list[float], scores)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
query = ["slm markdown"]
documents = [
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
]
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "text", "image"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "text", "image"
)
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
def _run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
model: str,
dtype: str,
query_strs: list[dict[str, str]],
document_strs: list[dict[str, str]],
) -> None:
"""Run comparison test between vLLM and HuggingFace implementations."""
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# Textual Documents Reranking vllm_outputs = _run_vllm(vllm_runner, model, dtype, query_strs, document_strs)
@pytest.mark.parametrize("model_name", [model_name]) hf_outputs = _run_hf(hf_runner, model, dtype, query_strs, document_strs)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_text(hf_runner, vllm_runner, model_name, dtype): # Compare outputs
query = ["slm markdown"] assert len(hf_outputs) == len(vllm_outputs), (
documents = [ f"Output length mismatch: HF={len(hf_outputs)}, vLLM={len(vllm_outputs)}"
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements.""", # noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "text", "text"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "text", "text"
) )
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02) for i, (hf_score, vllm_score) in enumerate(zip(hf_outputs, vllm_outputs)):
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02) assert hf_score == pytest.approx(vllm_score, rel=0.02), (
f"Score mismatch at index {i}: HF={hf_score}, vLLM={vllm_score}"
)
# Image Querying for Textual Documents @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_model_image_text(hf_runner, vllm_runner, model_name, dtype): def test_model_text_image(
query = [ hf_runner,
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" vllm_runner,
] model: str,
documents = [ dtype: str,
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient ) -> None:
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML """Visual Documents Reranking"""
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding _run_test(
large language models. The models effectiveness results from two key innovations: (1) a three-stage hf_runner,
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting, vllm_runner,
refining, and critiquing web content extraction; and (2) a unified training framework combining model,
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that dtype,
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated TEXT_IMAGE_TEST_DATA["query"],
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly TEXT_IMAGE_TEST_DATA["documents"],
lower computational requirements.""", # noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
]
hf_outputs = hf_reranker(
hf_runner, model_name, dtype, query, documents, "image", "text"
)
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "image", "text"
) )
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_text(
hf_runner,
vllm_runner,
model: str,
dtype: str,
) -> None:
"""Textual Documents Reranking"""
_run_test(
hf_runner,
vllm_runner,
model,
dtype,
TEXT_TEXT_TEST_DATA["query"],
TEXT_TEXT_TEST_DATA["documents"],
)
# Image Querying for Image Documents @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_model_image_image(hf_runner, vllm_runner, model_name, dtype): def test_model_image_text(
query = [ hf_runner,
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" vllm_runner,
] model: str,
documents = [ dtype: str,
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png", ) -> None:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png", """Image Querying for Textual Documents"""
] _run_test(
hf_runner,
hf_outputs = hf_reranker( vllm_runner,
hf_runner, model_name, dtype, query, documents, "image", "image" model,
dtype,
IMAGE_TEXT_TEST_DATA["query"],
IMAGE_TEXT_TEST_DATA["documents"],
) )
vllm_outputs = vllm_reranker(
vllm_runner, model_name, dtype, query, documents, "image", "image"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_image(
hf_runner,
vllm_runner,
model: str,
dtype: str,
) -> None:
"""Image Querying for Image Documents"""
_run_test(
hf_runner,
vllm_runner,
model,
dtype,
IMAGE_IMAGE_TEST_DATA["query"],
IMAGE_IMAGE_TEST_DATA["documents"],
) )
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_mixed_documents(
hf_runner,
vllm_runner,
model: str,
dtype: str,
) -> None:
"""Text Query for Mixed Text and Image Documents"""
_run_test(
hf_runner,
vllm_runner,
model,
dtype,
TEXT_MIXED_DOCS_TEST_DATA["query"],
TEXT_MIXED_DOCS_TEST_DATA["documents"],
)
...@@ -24,7 +24,9 @@ from vllm.outputs import PoolingRequestOutput ...@@ -24,7 +24,9 @@ from vllm.outputs import PoolingRequestOutput
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
ScoreContentPartParam: TypeAlias = ( ScoreContentPartParam: TypeAlias = (
ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam ChatCompletionContentPartImageParam
| ChatCompletionContentPartImageEmbedsParam
| ChatCompletionContentPartTextParam
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment