Unverified commit 0f465ab5, authored by Gabriel Marinho, committed by GitHub
Browse files

[FEATURE] Enables offline /score for embedding models (#12021)


Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
parent 23a7cbc8
...@@ -5,12 +5,18 @@ Run `pytest tests/models/embedding/language/test_scoring.py`. ...@@ -5,12 +5,18 @@ Run `pytest tests/models/embedding/language/test_scoring.py`.
import math import math
import pytest import pytest
import torch
import torch.nn.functional as F
MODELS = [ MODELS = [
"cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert
"BAAI/bge-reranker-v2-m3", # Roberta "BAAI/bge-reranker-v2-m3", # Roberta
] ]
# Models exercised by the `*_embedding` tests below, which load them with
# `task="embed"` and compare `score()` against a cosine similarity computed
# from the reference embeddings.
EMBEDDING_MODELS = [
    "sentence-transformers/all-MiniLM-L12-v2",
]
TEXTS_1 = [ TEXTS_1 = [
"What is the capital of France?", "What is the capital of France?",
"What is the capital of Germany?", "What is the capital of Germany?",
...@@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): ...@@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
def emb_model_name(request):
    """Provide each entry of ``EMBEDDING_MODELS`` as a module-scoped fixture."""
    # No teardown is needed, so the parameter is handed back directly.
    return request.param
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Compare a single-pair ``score()`` result against the reference.

    The reference value is the cosine similarity of the two embeddings
    returned by the HF runner's ``encode`` for the same pair of texts.
    """
    first_text, second_text = TEXTS_1[0], TEXTS_2[0]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        reference_embeddings = hf_model.encode([first_text, second_text])
        emb_first, emb_second = map(torch.tensor, reference_embeddings)
        hf_outputs = [F.cosine_similarity(emb_first, emb_second, dim=0)]

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(first_text, second_text)

    # Exactly one score is expected on each side before comparing values.
    assert len(vllm_outputs) == 1
    assert len(hf_outputs) == 1

    expected, actual = hf_outputs[0], vllm_outputs[0]
    assert math.isclose(expected, actual, rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Compare one-to-many ``score()`` results against the reference.

    The first entry of ``TEXTS_1`` is paired with the first two entries of
    ``TEXTS_2``; the reference for each pair is the cosine similarity of the
    embeddings returned by the HF runner's ``encode``.
    """
    anchor = TEXTS_1[0]
    text_pairs = [
        [anchor, TEXTS_2[0]],
        [anchor, TEXTS_2[1]],
    ]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        hf_outputs = [
            F.cosine_similarity(*map(torch.tensor, hf_model.encode(pair)),
                                dim=0) for pair in text_pairs
        ]

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(anchor, TEXTS_2)

    # Both sides must yield exactly two scores before comparing them pairwise.
    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

    for expected, actual in zip(hf_outputs, vllm_outputs):
        assert math.isclose(expected, actual, rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Compare element-wise ``score()`` results against the reference.

    Entry ``i`` of ``TEXTS_1`` is paired with entry ``i`` of ``TEXTS_2`` for
    the first two entries; the reference for each pair is the cosine
    similarity of the embeddings returned by the HF runner's ``encode``.
    """
    text_pairs = [[TEXTS_1[idx], TEXTS_2[idx]] for idx in (0, 1)]

    with hf_runner(emb_model_name, dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        hf_outputs = [
            F.cosine_similarity(*map(torch.tensor, hf_model.encode(pair)),
                                dim=0) for pair in text_pairs
        ]

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)

    # Both sides must yield exactly two scores before comparing them pairwise.
    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

    for expected, actual in zip(hf_outputs, vllm_outputs):
        assert math.isclose(expected, actual, rel_tol=0.01)
...@@ -5,6 +5,7 @@ from typing import (Any, Callable, ClassVar, Dict, List, Optional, Sequence, ...@@ -5,6 +5,7 @@ from typing import (Any, Callable, ClassVar, Dict, List, Optional, Sequence,
Tuple, Type, Union, cast, overload) Tuple, Type, Union, cast, overload)
import cloudpickle import cloudpickle
import torch
import torch.nn as nn import torch.nn as nn
from tqdm import tqdm from tqdm import tqdm
from typing_extensions import TypeVar, deprecated from typing_extensions import TypeVar, deprecated
...@@ -996,6 +997,107 @@ class LLM: ...@@ -996,6 +997,107 @@ class LLM:
return [ClassificationRequestOutput.from_base(item) for item in items] return [ClassificationRequestOutput.from_base(item) for item in items]
def _embedding_score(
    self,
    tokenizer: AnyTokenizer,
    text_1: List[Union[str, TextPrompt, TokensPrompt]],
    text_2: List[Union[str, TextPrompt, TokensPrompt]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: bool = True,
    lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
    prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> List[ScoringRequestOutput]:
    """Score text pairs by the cosine similarity of their embeddings.

    All prompts from ``text_1`` and ``text_2`` are embedded with a single
    ``self.encode`` call, then paired up position by position. When
    ``text_1`` holds exactly one prompt, its embedding is reused for every
    prompt in ``text_2``.

    Args:
        tokenizer: Only consulted for an optional ``pad_token_id``
            attribute, which is inserted between the two token-id
            sequences of each pair in the returned output.
        text_1: First side of each pair.
        text_2: Second side of each pair.
        truncate_prompt_tokens: Accepted but not used by this method.
        use_tqdm: Forwarded to ``self.encode``.
        lora_request: Forwarded to ``self.encode``.
        prompt_adapter_request: Forwarded to ``self.encode``.

    Returns:
        One ``ScoringRequestOutput`` per pair, in pair order.
    """
    split_at = len(text_1)
    embedded = self.encode(
        text_1 + text_2,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
        prompt_adapter_request=prompt_adapter_request)

    # The outputs are sliced back into the two sides by position.
    first_side = embedded[:split_at]
    second_side = embedded[split_at:]

    # Repeat a lone first-side embedding so it pairs with every second-side
    # one.
    if len(first_side) == 1:
        first_side = first_side * len(second_side)

    cosine = torch.nn.CosineSimilarity(0)
    pooled = []
    for left, right in zip(first_side, second_side):
        similarity = cosine(left.outputs.data, right.outputs.data)

        pad_token_id = getattr(tokenizer, "pad_token_id", None)
        if pad_token_id is None:
            joined_ids = left.prompt_token_ids + right.prompt_token_ids
        else:
            joined_ids = (left.prompt_token_ids + [pad_token_id] +
                          right.prompt_token_ids)

        pooled.append(
            PoolingRequestOutput(
                request_id=f"{left.request_id}_{right.request_id}",
                outputs=similarity,
                prompt_token_ids=joined_ids,
                finished=True))

    validated = self.engine_class.validate_outputs(pooled,
                                                   PoolingRequestOutput)
    return [ScoringRequestOutput.from_base(entry) for entry in validated]
def _cross_encoding_score(
    self,
    tokenizer: AnyTokenizer,
    text_1: List[Union[str, TextPrompt, TokensPrompt]],
    text_2: List[Union[str, TextPrompt, TokensPrompt]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: bool = True,
    lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
    prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> List[ScoringRequestOutput]:
    """Score text pairs by running each pair through the model jointly.

    Every ``(text_1[i], text_2[i])`` pair is tokenized with a single
    ``tokenizer(text=..., text_pair=...)`` call and submitted to the engine
    as one ``TokensPrompt``. When ``text_1`` holds exactly one entry it is
    repeated so that it is paired with every entry of ``text_2``.

    Args:
        tokenizer: Callable tokenizer accepting ``text`` / ``text_pair``.
        text_1: First side of each pair.
        text_2: Second side of each pair.
        truncate_prompt_tokens: When given, tokenization is run with
            ``truncation=True`` and ``max_length`` set to this value.
        use_tqdm: Forwarded to ``self._run_engine``.
        lora_request: Forwarded to ``self._validate_and_add_requests``.
        prompt_adapter_request: Forwarded to
            ``self._validate_and_add_requests``.

    Returns:
        One ``ScoringRequestOutput`` per engine output.

    Raises:
        ValueError: If ``tokenizer`` is a ``MistralTokenizer``.
    """
    if isinstance(tokenizer, MistralTokenizer):
        # The message names the actual cause (the tokenizer type) instead of
        # the unrelated `--task` hint that was raised here before.
        raise ValueError(
            "MistralTokenizer not supported for cross-encoding")

    # Repeat a lone first-side text so it pairs with every second-side one.
    if len(text_1) == 1:
        text_1 = text_1 * len(text_2)

    pooling_params = PoolingParams()

    tokenization_kwargs: Dict[str, Any] = {}
    if truncate_prompt_tokens is not None:
        tokenization_kwargs["truncation"] = True
        tokenization_kwargs["max_length"] = truncate_prompt_tokens

    parsed_prompts = []
    for q, t in zip(text_1, text_2):
        prompt_inputs = tokenizer(text=q,
                                  text_pair=t,
                                  **tokenization_kwargs)
        # `token_type_ids` is optional in the tokenizer output, hence `.get`.
        engine_prompt = TokensPrompt(
            prompt_token_ids=prompt_inputs["input_ids"],
            token_type_ids=prompt_inputs.get("token_type_ids"))
        parsed_prompts.append(engine_prompt)

    self._validate_and_add_requests(
        prompts=parsed_prompts,
        params=pooling_params,
        lora_request=lora_request,
        prompt_adapter_request=prompt_adapter_request,
    )

    outputs = self._run_engine(use_tqdm=use_tqdm)
    items = self.engine_class.validate_outputs(outputs,
                                               PoolingRequestOutput)

    return [ScoringRequestOutput.from_base(item) for item in items]
def score( def score(
self, self,
text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]], text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]],
...@@ -1047,25 +1149,20 @@ class LLM: ...@@ -1047,25 +1149,20 @@ class LLM:
raise ValueError(" ".join(messages)) raise ValueError(" ".join(messages))
if not self.llm_engine.model_config.is_cross_encoder: if self.llm_engine.model_config.task not in ("embed", "score"):
raise ValueError("Your model does not support cross encoding")
if self.llm_engine.model_config.task != "score":
raise ValueError("Score API is only enabled for `--task score`")
tokenizer = self.llm_engine.get_tokenizer()
if isinstance(tokenizer, MistralTokenizer):
raise ValueError( raise ValueError(
"MistralTokenizer not supported for cross-encoding") "Score API is only enabled for `--task embed or --task score`")
# the tokenizer for models such as # the tokenizer for models such as
# "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
# lists of tokens to the `text` and `text_pair` kwargs # lists of tokens to the `text` and `text_pair` kwargs
tokenizer = self.llm_engine.get_tokenizer()
def ensure_str(prompt: SingletonPrompt): def ensure_str(prompt: SingletonPrompt):
if isinstance(prompt, dict): if isinstance(prompt, dict):
if "multi_modal_data" in prompt: if "multi_modal_data" in prompt:
raise ValueError("Multi-modal prompt is not " raise ValueError("Multi-modal prompt is not "
"supported for cross encoding") "supported for scoring")
elif "prompt_token_ids" in prompt: elif "prompt_token_ids" in prompt:
prompt = tokenizer.decode( prompt = tokenizer.decode(
cast(TokensPrompt, prompt)["prompt_token_ids"]) cast(TokensPrompt, prompt)["prompt_token_ids"])
...@@ -1091,40 +1188,15 @@ class LLM: ...@@ -1091,40 +1188,15 @@ class LLM:
if len(text_2) == 0: if len(text_2) == 0:
raise ValueError("At least one text_pair element must be given") raise ValueError("At least one text_pair element must be given")
if len(text_1) == 1: if self.llm_engine.model_config.is_cross_encoder:
text_1 = text_1 * len(text_2) return self._cross_encoding_score(tokenizer, text_1, text_2,
truncate_prompt_tokens, use_tqdm,
input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] lora_request,
pooling_params = PoolingParams() prompt_adapter_request)
else:
tokenization_kwargs: Dict[str, Any] = {} return self._embedding_score(tokenizer, text_1, text_2,
if truncate_prompt_tokens is not None: truncate_prompt_tokens, use_tqdm,
tokenization_kwargs["truncation"] = True lora_request, prompt_adapter_request)
tokenization_kwargs["max_length"] = truncate_prompt_tokens
parsed_prompts = []
for q, t in input_pairs:
prompt_inputs = tokenizer(text=q,
text_pair=t,
**tokenization_kwargs)
engine_prompt = TokensPrompt(
prompt_token_ids=prompt_inputs["input_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))
parsed_prompts.append(engine_prompt)
self._validate_and_add_requests(
prompts=parsed_prompts,
params=pooling_params,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
)
outputs = self._run_engine(use_tqdm=use_tqdm)
items = self.engine_class.validate_outputs(outputs,
PoolingRequestOutput)
return [ScoringRequestOutput.from_base(item) for item in items]
def start_profile(self) -> None: def start_profile(self) -> None:
self.llm_engine.start_profile() self.llm_engine.start_profile()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment