Unverified commit 911d38ed, authored by wang.yuqi, committed by GitHub
Browse files

[Model] Let more models support the score template. (#31335)


Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
parent caaa482a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
from typing import Any
import mteb
import numpy as np
import pytest
import torch
from torch.utils.data import DataLoader
from tests.conftest import HfRunner
from tests.models.utils import RerankModelInfo
from tests.utils import multi_gpu_test
from .mteb_score_utils import mteb_test_rerank_models
from .mteb_score_utils import MtebCrossEncoderMixin, mteb_test_rerank_models
qwen3_reranker_hf_overrides = {
"architectures": ["Qwen3ForSequenceClassification"],
......@@ -21,51 +25,71 @@ RERANK_MODELS = [
RerankModelInfo(
"Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
chat_template_name="qwen3_reranker.jinja",
pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
mteb_score=0.33459,
enable_test=True,
),
RerankModelInfo(
"Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
chat_template_name="qwen3_reranker.jinja",
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=False,
),
]
class Qwen3RerankerHfRunner(HfRunner):
class Qwen3RerankerHfRunner(MtebCrossEncoderMixin, HfRunner):
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
HfRunner.__init__(
self,
model_name=model_name,
auto_cls=AutoModelForCausalLM,
dtype=dtype,
**kwargs,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def process_inputs(pairs):
inputs = self.tokenizer(
pairs,
padding=False,
truncation="longest_first",
return_attention_mask=False,
self.max_length = 40960
@torch.no_grad
def predict(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
tokenizer = self.tokenizer
prompts = []
for query, document in zip(queries, corpus):
conversation = [
{"role": "query", "content": query},
{"role": "document", "content": document},
]
prompt = tokenizer.apply_chat_template(
conversation=conversation,
tools=None,
chat_template=self.chat_template,
tokenize=False,
)
for i, ele in enumerate(inputs["input_ids"]):
inputs["input_ids"][i] = ele
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs
@torch.no_grad()
prompts.append(prompt)
def compute_logits(inputs):
batch_scores = self.model(**inputs).logits[:, -1, :]
true_vector = batch_scores[:, self.token_true_id]
......@@ -76,9 +100,9 @@ class Qwen3RerankerHfRunner(HfRunner):
return scores
scores = []
for query, doc, *_ in prompts:
pairs = [(query, doc)]
inputs = process_inputs(pairs)
for prompt in prompts:
inputs = tokenizer([prompt], return_tensors="pt")
inputs = self.wrap_device(inputs)
score = compute_logits(inputs)
scores.append(score[0].item())
return torch.Tensor(scores)
......@@ -86,7 +110,7 @@ class Qwen3RerankerHfRunner(HfRunner):
# Calls mteb_test_rerank_models once per entry in RERANK_MODELS (via parametrize),
# using Qwen3RerankerHfRunner as the Hugging Face-side runner.
# NOTE(review): the two calls below look like the removed and added sides of a
# flattened diff hunk (positional runner-first form, then the keyword
# hf_runner= form) — presumably only the second is meant to remain; confirm
# against the current mteb_test_rerank_models signature.
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)
mteb_test_rerank_models(vllm_runner, model_info, hf_runner=Qwen3RerankerHfRunner)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
......@@ -99,5 +123,8 @@ def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None
}
mteb_test_rerank_models(
Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs
vllm_runner,
model_info,
vllm_extra_kwargs=vllm_extra_kwargs,
hf_runner=Qwen3RerankerHfRunner,
)
......@@ -251,7 +251,7 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
tokens = getattr(config, "classifier_from_token", None)
assert tokens is not None and len(tokens) == 2, (
"Try loading the original Qwen3 Reranker?, see: "
"https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py"
"https://github.com/vllm-project/vllm/tree/main/examples/pooling/score/qwen3_reranker_offline.py"
)
model_config.hf_config.method = "from_2_way_softmax"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment