from typing import List, Iterable
import collections
import string
import os
import json
from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS
class WhitespaceTokenizer(WordTokenizer):
"""
Simple and fast whitespace tokenizer that splits a sentence on whitespace.
Punctuation is stripped from tokens.
"""
def __init__(
self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False
):
self.stop_words = set(stop_words)
self.do_lower_case = do_lower_case
self.set_vocab(vocab)
def get_vocab(self):
return self.vocab
def set_vocab(self, vocab: Iterable[str]):
self.vocab = vocab
self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)])
def tokenize(self, text: str, **kwargs) -> List[int]:
if self.do_lower_case:
text = text.lower()
tokens = text.split()
tokens_filtered = []
for token in tokens:
if token in self.stop_words:
continue
elif token in self.word2idx:
tokens_filtered.append(self.word2idx[token])
continue
token = token.strip(string.punctuation)
if token in self.stop_words:
continue
elif len(token) > 0 and token in self.word2idx:
tokens_filtered.append(self.word2idx[token])
continue
token = token.lower()
if token in self.stop_words:
continue
elif token in self.word2idx:
tokens_filtered.append(self.word2idx[token])
continue
return tokens_filtered
def save(self, output_path: str):
with open(os.path.join(output_path, "whitespacetokenizer_config.json"), "w") as fOut:
json.dump(
{
"vocab": list(self.word2idx.keys()),
"stop_words": list(self.stop_words),
"do_lower_case": self.do_lower_case,
},
fOut,
)
@staticmethod
def load(input_path: str):
with open(os.path.join(input_path, "whitespacetokenizer_config.json"), "r") as fIn:
config = json.load(fIn)
return WhitespaceTokenizer(**config)
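# Example (illustrative sketch, not part of the original file; the two-word
# vocabulary is hypothetical): stop words are dropped and punctuation is
# stripped before the vocabulary lookup.
demo_tokenizer = WhitespaceTokenizer(vocab=["hello", "world"], do_lower_case=True)
print(demo_tokenizer.tokenize("Hello, world! The end."))  # [0, 1]; "the" is a stop word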
from abc import ABC, abstractmethod
from typing import List, Iterable
ENGLISH_STOP_WORDS = [
"!",
'"',
"''",
"``",
"#",
"$",
"%",
"&",
"'",
"(",
")",
"*",
"+",
",",
"-",
".",
"/",
":",
";",
"<",
"=",
">",
"?",
"@",
"[",
"\\",
"]",
"^",
"_",
"`",
"{",
"|",
"}",
"~",
"a",
"about",
"above",
"across",
"after",
"afterwards",
"again",
"against",
"ain",
"all",
"almost",
"alone",
"along",
"already",
"also",
"although",
"always",
"am",
"among",
"amongst",
"amoungst",
"amount",
"an",
"and",
"another",
"any",
"anyhow",
"anyone",
"anything",
"anyway",
"anywhere",
"are",
"aren",
"around",
"as",
"at",
"back",
"be",
"became",
"because",
"become",
"becomes",
"becoming",
"been",
"before",
"beforehand",
"behind",
"being",
"below",
"beside",
"besides",
"between",
"beyond",
"bill",
"both",
"bottom",
"but",
"by",
"call",
"can",
"cannot",
"cant",
"co",
"con",
"could",
"couldn",
"couldnt",
"cry",
"d",
"de",
"describe",
"detail",
"did",
"didn",
"do",
"does",
"doesn",
"doing",
"don",
"done",
"down",
"due",
"during",
"each",
"eg",
"eight",
"either",
"eleven",
"else",
"elsewhere",
"empty",
"enough",
"etc",
"even",
"ever",
"every",
"everyone",
"everything",
"everywhere",
"except",
"few",
"fifteen",
"fifty",
"fill",
"find",
"fire",
"first",
"five",
"for",
"former",
"formerly",
"forty",
"found",
"four",
"from",
"front",
"full",
"further",
"get",
"give",
"go",
"had",
"hadn",
"has",
"hasn",
"hasnt",
"have",
"haven",
"having",
"he",
"hence",
"her",
"here",
"hereafter",
"hereby",
"herein",
"hereupon",
"hers",
"herself",
"him",
"himself",
"his",
"how",
"however",
"hundred",
"i",
"ie",
"if",
"in",
"inc",
"indeed",
"interest",
"into",
"is",
"isn",
"it",
"its",
"itself",
"just",
"keep",
"last",
"latter",
"latterly",
"least",
"less",
"ll",
"ltd",
"m",
"ma",
"made",
"many",
"may",
"me",
"meanwhile",
"might",
"mightn",
"mill",
"mine",
"more",
"moreover",
"most",
"mostly",
"move",
"much",
"must",
"mustn",
"my",
"myself",
"name",
"namely",
"needn",
"neither",
"never",
"nevertheless",
"next",
"nine",
"no",
"nobody",
"none",
"noone",
"nor",
"not",
"nothing",
"now",
"nowhere",
"o",
"of",
"off",
"often",
"on",
"once",
"one",
"only",
"onto",
"or",
"other",
"others",
"otherwise",
"our",
"ours",
"ourselves",
"out",
"over",
"own",
"part",
"per",
"perhaps",
"please",
"put",
"rather",
"re",
"s",
"same",
"see",
"seem",
"seemed",
"seeming",
"seems",
"serious",
"several",
"shan",
"she",
"should",
"shouldn",
"show",
"side",
"since",
"sincere",
"six",
"sixty",
"so",
"some",
"somehow",
"someone",
"something",
"sometime",
"sometimes",
"somewhere",
"still",
"such",
"system",
"t",
"take",
"ten",
"than",
"that",
"the",
"their",
"theirs",
"them",
"themselves",
"then",
"thence",
"there",
"thereafter",
"thereby",
"therefore",
"therein",
"thereupon",
"these",
"they",
"thick",
"thin",
"third",
"this",
"those",
"though",
"three",
"through",
"throughout",
"thru",
"thus",
"to",
"together",
"too",
"top",
"toward",
"towards",
"twelve",
"twenty",
"two",
"un",
"under",
"until",
"up",
"upon",
"us",
"ve",
"very",
"via",
"was",
"wasn",
"we",
"well",
"were",
"weren",
"what",
"whatever",
"when",
"whence",
"whenever",
"where",
"whereafter",
"whereas",
"whereby",
"wherein",
"whereupon",
"wherever",
"whether",
"which",
"while",
"whither",
"who",
"whoever",
"whole",
"whom",
"whose",
"why",
"will",
"with",
"within",
"without",
"won",
"would",
"wouldn",
"y",
"yet",
"you",
"your",
"yours",
"yourself",
"yourselves",
]
class WordTokenizer(ABC):
@abstractmethod
def set_vocab(self, vocab: Iterable[str]):
pass
@abstractmethod
def get_vocab(self):
pass
@abstractmethod
def tokenize(self, text: str, **kwargs) -> List[int]:
pass
@abstractmethod
def save(self, output_path: str):
pass
@staticmethod
@abstractmethod
def load(input_path: str):
pass
from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS
from .WhitespaceTokenizer import WhitespaceTokenizer
from .PhraseTokenizer import PhraseTokenizer
__all__ = ["WordTokenizer", "WhitespaceTokenizer", "PhraseTokenizer", "ENGLISH_STOP_WORDS"]
import time
from torch import Tensor
from typing import List, Literal, Tuple, TYPE_CHECKING
import numpy as np
import logging
from typing import Dict, Optional, Union
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
import faiss
import usearch
def semantic_search_faiss(
query_embeddings: np.ndarray,
corpus_embeddings: Optional[np.ndarray] = None,
corpus_index: Optional["faiss.Index"] = None,
corpus_precision: Literal["float32", "uint8", "ubinary"] = "float32",
top_k: int = 10,
ranges: Optional[np.ndarray] = None,
calibration_embeddings: Optional[np.ndarray] = None,
rescore: bool = True,
rescore_multiplier: int = 2,
exact: bool = True,
output_index: bool = False,
) -> Tuple[List[List[Dict[str, Union[int, float]]]], float, "faiss.Index"]:
"""
Performs semantic search using the FAISS library.
Rescoring will be performed if:
1. `rescore` is True
2. The query embeddings are not quantized
3. The corpus is quantized, i.e. the corpus precision is not float32
Only if these conditions are met will we search for `top_k * rescore_multiplier` samples and then rescore to
keep only `top_k`.
:param query_embeddings: Embeddings of the query sentences. Ideally not quantized to allow for rescoring.
:param corpus_embeddings: Embeddings of the corpus sentences. Either `corpus_embeddings` or `corpus_index` should
be used, not both. The embeddings can be quantized to "int8" or "binary" for more efficient search.
:param corpus_index: FAISS index for the corpus sentences. Either `corpus_embeddings` or `corpus_index` should
be used, not both.
:param corpus_precision: Precision of the corpus embeddings. The options are "float32", "uint8", or "ubinary".
Default is "float32".
:param top_k: Number of top results to retrieve. Default is 10.
:param ranges: Ranges for quantization of embeddings. This is only used for int8 quantization, where the ranges
refer to the minimum and maximum values for each dimension, i.e. a 2D array with shape (2, embedding_dim).
Default is None, which means that the ranges will be calculated from the calibration embeddings.
:param calibration_embeddings: Embeddings used for calibration during quantization. This is only used for int8
quantization, where the calibration embeddings can be used to compute ranges, i.e. the minimum and maximum
values for each dimension. Default is None, which means that the ranges will be calculated from the query
embeddings. This is not recommended.
:param rescore: Whether to perform rescoring. Note that rescoring will only be performed if the query embeddings
are not quantized and the corpus is quantized, i.e. the corpus precision is not "float32". Default is True.
:param rescore_multiplier: Oversampling factor for rescoring. The code will search `top_k * rescore_multiplier` samples
and then rescore to only keep `top_k`. Default is 2.
:param exact: Whether to use exact search or approximate search. Default is True.
:param output_index: Whether to output the FAISS index used for the search. Default is False.
:return: A tuple containing a list of search results and the time taken for the search. If `output_index` is True,
the tuple will also contain the FAISS index used for the search.
:raises ValueError: If both `corpus_embeddings` and `corpus_index` are provided or if neither is provided.
The list of search results is in the format: [[{"corpus_id": int, "score": float}, ...], ...]
The time taken for the search is a float value.
"""
import faiss
if corpus_embeddings is not None and corpus_index is not None:
raise ValueError("Only corpus_embeddings or corpus_index should be used, not both.")
if corpus_embeddings is None and corpus_index is None:
raise ValueError("Either corpus_embeddings or corpus_index should be used.")
# If corpus_index is not provided, create a new index
if corpus_index is None:
if corpus_precision in ("float32", "uint8"):
if exact:
corpus_index = faiss.IndexFlatIP(corpus_embeddings.shape[1])
else:
corpus_index = faiss.IndexHNSWFlat(corpus_embeddings.shape[1], 16)
elif corpus_precision == "ubinary":
if exact:
corpus_index = faiss.IndexBinaryFlat(corpus_embeddings.shape[1] * 8)
else:
corpus_index = faiss.IndexBinaryHNSW(corpus_embeddings.shape[1] * 8, 16)
corpus_index.add(corpus_embeddings)
# If rescoring is enabled and the query embeddings are in float32, we need to quantize them
# to the same precision as the corpus embeddings. Also update the top_k value to account for the
# rescore_multiplier
rescore_embeddings = None
k = top_k
if query_embeddings.dtype not in (np.uint8, np.int8):
if rescore:
if corpus_precision != "float32":
rescore_embeddings = query_embeddings
k *= rescore_multiplier
else:
logger.warning(
"Rescoring is enabled but the corpus is not quantized. Either pass `rescore=False` or "
'quantize the corpus embeddings with `quantize_embeddings(embeddings, precision="...") `'
'and pass `corpus_precision="..."` to `semantic_search_faiss`.'
)
query_embeddings = quantize_embeddings(
query_embeddings,
precision=corpus_precision,
ranges=ranges,
calibration_embeddings=calibration_embeddings,
)
elif rescore:
logger.warning(
"Rescoring is enabled but the query embeddings are quantized. Either pass `rescore=False` or don't quantize the query embeddings."
)
# Perform the search using the FAISS index
start_t = time.time()
scores, indices = corpus_index.search(query_embeddings, k)
# If rescoring is enabled, we need to rescore the results using the rescore_embeddings
if rescore_embeddings is not None:
top_k_embeddings = np.array(
[[corpus_index.reconstruct(idx.item()) for idx in query_indices] for query_indices in indices]
)
# If the corpus precision is binary, we need to unpack the bits
if corpus_precision == "ubinary":
top_k_embeddings = np.unpackbits(top_k_embeddings, axis=-1).astype(int)
else:
top_k_embeddings = top_k_embeddings.astype(int)
# rescore_embeddings: [num_queries, embedding_dim]
# top_k_embeddings: [num_queries, top_k, embedding_dim]
# updated_scores: [num_queries, top_k]
# We use einsum to calculate the dot product between the query and the top_k embeddings, equivalent to looping
# over the queries and calculating 'rescore_embeddings[i] @ top_k_embeddings[i].T'
rescored_scores = np.einsum("ij,ikj->ik", rescore_embeddings, top_k_embeddings)
rescored_indices = np.argsort(-rescored_scores)[:, :top_k]
indices = indices[np.arange(len(query_embeddings))[:, None], rescored_indices]
scores = rescored_scores[np.arange(len(query_embeddings))[:, None], rescored_indices]
delta_t = time.time() - start_t
outputs = (
[
[
{"corpus_id": int(neighbor), "score": float(score)}
for score, neighbor in zip(scores[query_id], indices[query_id])
]
for query_id in range(len(query_embeddings))
],
delta_t,
)
if output_index:
outputs = (*outputs, corpus_index)
return outputs
def semantic_search_usearch(
query_embeddings: np.ndarray,
corpus_embeddings: Optional[np.ndarray] = None,
corpus_index: Optional["usearch.index.Index"] = None,
corpus_precision: Literal["float32", "int8", "binary"] = "float32",
top_k: int = 10,
ranges: Optional[np.ndarray] = None,
calibration_embeddings: Optional[np.ndarray] = None,
rescore: bool = True,
rescore_multiplier: int = 2,
exact: bool = True,
output_index: bool = False,
) -> Tuple[List[List[Dict[str, Union[int, float]]]], float, "usearch.index.Index"]:
"""
Performs semantic search using the usearch library.
Rescoring will be performed if:
1. `rescore` is True
2. The query embeddings are not quantized
3. The corpus is quantized, i.e. the corpus precision is not float32
Only if these conditions are met will we search for `top_k * rescore_multiplier` samples and then rescore to
keep only `top_k`.
:param query_embeddings: Embeddings of the query sentences. Ideally not quantized to allow for rescoring.
:param corpus_embeddings: Embeddings of the corpus sentences. Either `corpus_embeddings` or `corpus_index` should
be used, not both. The embeddings can be quantized to "int8" or "binary" for more efficient search.
:param corpus_index: usearch index for the corpus sentences. Either `corpus_embeddings` or `corpus_index` should
be used, not both.
:param corpus_precision: Precision of the corpus embeddings. The options are "float32", "int8", or "binary".
Default is "float32".
:param top_k: Number of top results to retrieve. Default is 10.
:param ranges: Ranges for quantization of embeddings. This is only used for int8 quantization, where the ranges
refer to the minimum and maximum values for each dimension, i.e. a 2D array with shape (2, embedding_dim).
Default is None, which means that the ranges will be calculated from the calibration embeddings.
:param calibration_embeddings: Embeddings used for calibration during quantization. This is only used for int8
quantization, where the calibration embeddings can be used to compute ranges, i.e. the minimum and maximum
values for each dimension. Default is None, which means that the ranges will be calculated from the query
embeddings. This is not recommended.
:param rescore: Whether to perform rescoring. Note that rescoring will only be performed if the query embeddings
are not quantized and the corpus is quantized, i.e. the corpus precision is not "float32". Default is True.
:param rescore_multiplier: Oversampling factor for rescoring. The code will search `top_k * rescore_multiplier` samples
and then rescore to only keep `top_k`. Default is 2.
:param exact: Whether to use exact search or approximate search. Default is True.
:param output_index: Whether to output the usearch index used for the search. Default is False.
:return: A tuple containing a list of search results and the time taken for the search. If `output_index` is True,
the tuple will also contain the usearch index used for the search.
:raises ValueError: If both `corpus_embeddings` and `corpus_index` are provided or if neither is provided.
The list of search results is in the format: [[{"corpus_id": int, "score": float}, ...], ...]
The time taken for the search is a float value.
"""
from usearch.index import Index
from usearch.compiled import ScalarKind
if corpus_embeddings is not None and corpus_index is not None:
raise ValueError("Only corpus_embeddings or corpus_index should be used, not both.")
if corpus_embeddings is None and corpus_index is None:
raise ValueError("Either corpus_embeddings or corpus_index should be used.")
if corpus_precision not in ["float32", "int8", "binary"]:
raise ValueError('corpus_precision must be "float32", "int8", or "binary" for usearch')
# If corpus_index is not provided, create a new index
if corpus_index is None:
if corpus_precision == "float32":
corpus_index = Index(
ndim=corpus_embeddings.shape[1],
metric="cos",
dtype="f32",
)
elif corpus_precision == "int8":
corpus_index = Index(
ndim=corpus_embeddings.shape[1],
metric="ip",
dtype="i8",
)
elif corpus_precision == "binary":
corpus_index = Index(
ndim=corpus_embeddings.shape[1],
metric="hamming",
dtype="i8",
)
corpus_index.add(np.arange(len(corpus_embeddings)), corpus_embeddings)
# If rescoring is enabled and the query embeddings are in float32, we need to quantize them
# to the same precision as the corpus embeddings. Also update the top_k value to account for the
# rescore_multiplier
rescore_embeddings = None
k = top_k
if query_embeddings.dtype not in (np.uint8, np.int8):
if rescore:
if corpus_index.dtype != ScalarKind.F32:
rescore_embeddings = query_embeddings
k *= rescore_multiplier
else:
logger.warning(
"Rescoring is enabled but the corpus is not quantized. Either pass `rescore=False` or "
'quantize the corpus embeddings with `quantize_embeddings(embeddings, precision="...") `'
'and pass `corpus_precision="..."` to `semantic_search_usearch`.'
)
query_embeddings = quantize_embeddings(
query_embeddings,
precision=corpus_precision,
ranges=ranges,
calibration_embeddings=calibration_embeddings,
)
elif rescore:
logger.warning(
"Rescoring is enabled but the query embeddings are quantized. Either pass `rescore=False` or don't quantize the query embeddings."
)
# Perform the search using the usearch index
start_t = time.time()
matches = corpus_index.search(query_embeddings, count=k, exact=exact)
scores = matches.distances
indices = matches.keys
if scores.ndim < 2:
scores = np.atleast_2d(scores)
if indices.ndim < 2:
indices = np.atleast_2d(indices)
# If rescoring is enabled, we need to rescore the results using the rescore_embeddings
if rescore_embeddings is not None:
top_k_embeddings = np.array([corpus_index.get(query_indices) for query_indices in indices])
# If the corpus precision is binary, we need to unpack the bits
if corpus_precision == "binary":
top_k_embeddings = np.unpackbits(top_k_embeddings.astype(np.uint8), axis=-1)
top_k_embeddings = top_k_embeddings.astype(int)
# rescore_embeddings: [num_queries, embedding_dim]
# top_k_embeddings: [num_queries, top_k, embedding_dim]
# updated_scores: [num_queries, top_k]
# We use einsum to calculate the dot product between the query and the top_k embeddings, equivalent to looping
# over the queries and calculating 'rescore_embeddings[i] @ top_k_embeddings[i].T'
rescored_scores = np.einsum("ij,ikj->ik", rescore_embeddings, top_k_embeddings)
rescored_indices = np.argsort(-rescored_scores)[:, :top_k]
indices = indices[np.arange(len(query_embeddings))[:, None], rescored_indices]
scores = rescored_scores[np.arange(len(query_embeddings))[:, None], rescored_indices]
delta_t = time.time() - start_t
outputs = (
[
[
{"corpus_id": int(neighbor), "score": float(score)}
for score, neighbor in zip(scores[query_id], indices[query_id])
]
for query_id in range(len(query_embeddings))
],
delta_t,
)
if output_index:
outputs = (*outputs, corpus_index)
return outputs
def quantize_embeddings(
embeddings: Union[Tensor, np.ndarray],
precision: Literal["float32", "int8", "uint8", "binary", "ubinary"],
ranges: Optional[np.ndarray] = None,
calibration_embeddings: Optional[np.ndarray] = None,
) -> np.ndarray:
"""
Quantizes embeddings to a lower precision. This can be used to reduce the memory footprint and increase the
speed of similarity search. The supported precisions are "float32", "int8", "uint8", "binary", and "ubinary".
:param embeddings: Unquantized (e.g. float) embeddings to quantize to a given precision
:param precision: The precision to convert to. Options are "float32", "int8", "uint8", "binary", "ubinary".
:param ranges: Ranges for quantization of embeddings. This is only used for int8 quantization, where the ranges
refers to the minimum and maximum values for each dimension. So, it's a 2D array with shape (2, embedding_dim).
Default is None, which means that the ranges will be calculated from the calibration embeddings.
:type ranges: Optional[np.ndarray]
:param calibration_embeddings: Embeddings used for calibration during quantization. This is only used for int8
quantization, where the calibration embeddings can be used to compute ranges, i.e. the minimum and maximum
values for each dimension. Default is None, which means that the ranges will be calculated from the query
embeddings. This is not recommended.
:type calibration_embeddings: Optional[np.ndarray]
:return: Quantized embeddings with the specified precision
"""
if isinstance(embeddings, Tensor):
embeddings = embeddings.cpu().numpy()
elif isinstance(embeddings, list):
if isinstance(embeddings[0], Tensor):
embeddings = [embedding.cpu().numpy() for embedding in embeddings]
embeddings = np.array(embeddings)
if embeddings.dtype in (np.uint8, np.int8):
raise ValueError("Embeddings to quantize must be float rather than int8 or uint8.")
if precision == "float32":
return embeddings.astype(np.float32)
if precision.endswith("int8"):
# Either use the 1. provided ranges, 2. the calibration dataset or 3. the provided embeddings
if ranges is None:
if calibration_embeddings is not None:
ranges = np.vstack((np.min(calibration_embeddings, axis=0), np.max(calibration_embeddings, axis=0)))
else:
if embeddings.shape[0] < 100:
logger.warning(
f"Computing {precision} quantization buckets based on {len(embeddings)} embedding{'s' if len(embeddings) != 1 else ''}."
f" {precision} quantization is more stable with `ranges` calculated from more embeddings "
"or a `calibration_embeddings` that can be used to calculate the buckets."
)
ranges = np.vstack((np.min(embeddings, axis=0), np.max(embeddings, axis=0)))
starts = ranges[0, :]
steps = (ranges[1, :] - ranges[0, :]) / 255
if precision == "uint8":
return ((embeddings - starts) / steps).astype(np.uint8)
elif precision == "int8":
return ((embeddings - starts) / steps - 128).astype(np.int8)
if precision == "binary":
return (np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1) - 128).astype(np.int8)
if precision == "ubinary":
return np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1)
raise ValueError(f"Precision {precision} is not supported")
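# Example (illustrative sketch, not part of the original file): quantize a float32
# corpus to packed binary ("ubinary") and search it with float32 queries, which
# triggers the rescoring path described above. Assumes faiss is installed; the
# embeddings are random stand-ins. semantic_search_usearch follows the same call
# shape with corpus_precision="binary".
demo_rng = np.random.default_rng(0)
demo_corpus = demo_rng.normal(size=(100, 32)).astype(np.float32)
demo_queries = demo_rng.normal(size=(2, 32)).astype(np.float32)
demo_ubinary_corpus = quantize_embeddings(demo_corpus, precision="ubinary")  # uint8, 32x smaller
demo_results, demo_search_time = semantic_search_faiss(
demo_queries, corpus_embeddings=demo_ubinary_corpus, corpus_precision="ubinary", top_k=5
)
print(demo_results[0][0])  # e.g. {'corpus_id': ..., 'score': ...}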
from typing import Union, List
class InputExample:
"""
Structure for one input example with texts, the label and a unique id
"""
def __init__(self, guid: str = "", texts: List[str] = None, label: Union[int, float] = 0):
"""
Creates one InputExample with the given texts, guid and label
:param guid
id for the example
:param texts
the texts for the example.
:param label
the label for the example
"""
self.guid = guid
self.texts = texts
self.label = label
def __str__(self):
return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(self.texts))
from . import InputExample
import os
class LabelSentenceReader:
"""Reads in a file that has at least two columns: a label and a sentence.
This reader can for example be used with the BatchHardTripletLoss.
Maps labels automatically to integers"""
def __init__(self, folder, label_col_idx=0, sentence_col_idx=1, separator="\t"):
self.folder = folder
self.label_map = {}
self.label_col_idx = label_col_idx
self.sentence_col_idx = sentence_col_idx
self.separator = separator
def get_examples(self, filename, max_examples=0):
examples = []
id = 0
with open(os.path.join(self.folder, filename), encoding="utf-8") as fIn:
for line in fIn:
splits = line.strip().split(self.separator)
label = splits[self.label_col_idx]
sentence = splits[self.sentence_col_idx]
if label not in self.label_map:
self.label_map[label] = len(self.label_map)
label_id = self.label_map[label]
guid = "%s-%d" % (filename, id)
id += 1
examples.append(InputExample(guid=guid, texts=[sentence], label=label_id))
if 0 < max_examples <= id:
break
return examples
from . import InputExample
import gzip
import os
class NLIDataReader(object):
"""
Reads in the Stanford NLI dataset and the MultiGenre NLI dataset
"""
def __init__(self, dataset_folder):
self.dataset_folder = dataset_folder
def get_examples(self, filename, max_examples=0):
"""
filename specifies which data split to use (train, dev, test).
Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz,
labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz
"""
s1 = gzip.open(os.path.join(self.dataset_folder, "s1." + filename), mode="rt", encoding="utf-8").readlines()
s2 = gzip.open(os.path.join(self.dataset_folder, "s2." + filename), mode="rt", encoding="utf-8").readlines()
labels = gzip.open(
os.path.join(self.dataset_folder, "labels." + filename), mode="rt", encoding="utf-8"
).readlines()
examples = []
id = 0
for sentence_a, sentence_b, label in zip(s1, s2, labels):
guid = "%s-%d" % (filename, id)
id += 1
examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label)))
if 0 < max_examples <= len(examples):
break
return examples
@staticmethod
def get_labels():
return {"contradiction": 0, "entailment": 1, "neutral": 2}
def get_num_labels(self):
return len(self.get_labels())
def map_label(self, label):
return self.get_labels()[label.strip().lower()]
from . import InputExample
import gzip
class PairedFilesReader(object):
"""
Reads in the a Pair Dataset, split in two files
"""
def __init__(self, filepaths):
self.filepaths = filepaths
def get_examples(self, max_examples=0):
""" """
fIns = []
for filepath in self.filepaths:
fIn = (
gzip.open(filepath, "rt", encoding="utf-8")
if filepath.endswith(".gz")
else open(filepath, encoding="utf-8")
)
fIns.append(fIn)
examples = []
eof = False
while not eof:
texts = []
for fIn in fIns:
text = fIn.readline()
if text == "":
eof = True
break
texts.append(text)
if eof:
break
examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1))
if max_examples > 0 and len(examples) >= max_examples:
break
return examples
from . import InputExample
import csv
import gzip
import os
class STSDataReader:
"""
Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx)
Default values expects a tab separated file with the first & second column the sentence pair and third column the score (0...1). Default config normalizes scores from 0...5 to 0...1
"""
def __init__(
self,
dataset_folder,
s1_col_idx=0,
s2_col_idx=1,
score_col_idx=2,
delimiter="\t",
quoting=csv.QUOTE_NONE,
normalize_scores=True,
min_score=0,
max_score=5,
):
self.dataset_folder = dataset_folder
self.score_col_idx = score_col_idx
self.s1_col_idx = s1_col_idx
self.s2_col_idx = s2_col_idx
self.delimiter = delimiter
self.quoting = quoting
self.normalize_scores = normalize_scores
self.min_score = min_score
self.max_score = max_score
def get_examples(self, filename, max_examples=0):
"""
filename specifies which data split to use (train.csv, dev.csv, test.csv).
"""
filepath = os.path.join(self.dataset_folder, filename)
with gzip.open(filepath, "rt", encoding="utf8") if filename.endswith(".gz") else open(
filepath, encoding="utf-8"
) as fIn:
data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting)
examples = []
for id, row in enumerate(data):
score = float(row[self.score_col_idx])
if self.normalize_scores: # Normalize to a 0...1 value
score = (score - self.min_score) / (self.max_score - self.min_score)
s1 = row[self.s1_col_idx]
s2 = row[self.s2_col_idx]
examples.append(InputExample(guid=filename + str(id), texts=[s1, s2], label=score))
if max_examples > 0 and len(examples) >= max_examples:
break
return examples
class STSBenchmarkDataReader(STSDataReader):
"""
Reader especially for the STS benchmark dataset. There, the sentences are in column 5 and 6, the score is in column 4.
Scores are normalized from 0...5 to 0...1
"""
def __init__(
self,
dataset_folder,
s1_col_idx=5,
s2_col_idx=6,
score_col_idx=4,
delimiter="\t",
quoting=csv.QUOTE_NONE,
normalize_scores=True,
min_score=0,
max_score=5,
):
super().__init__(
dataset_folder=dataset_folder,
s1_col_idx=s1_col_idx,
s2_col_idx=s2_col_idx,
score_col_idx=score_col_idx,
delimiter=delimiter,
quoting=quoting,
normalize_scores=normalize_scores,
min_score=min_score,
max_score=max_score,
)
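# Example (illustrative sketch; "datasets/sts" and "train.csv" are hypothetical
# paths): scores are normalized from 0...5 to 0...1 by default.
demo_reader = STSDataReader("datasets/sts")
demo_examples = demo_reader.get_examples("train.csv", max_examples=10)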
from . import InputExample
import csv
import os
class TripletReader(object):
"""
Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1),
one positive example (s2) and one negative example (s3)
"""
def __init__(
self,
dataset_folder,
s1_col_idx=0,
s2_col_idx=1,
s3_col_idx=2,
has_header=False,
delimiter="\t",
quoting=csv.QUOTE_NONE,
):
self.dataset_folder = dataset_folder
self.s1_col_idx = s1_col_idx
self.s2_col_idx = s2_col_idx
self.s3_col_idx = s3_col_idx
self.has_header = has_header
self.delimiter = delimiter
self.quoting = quoting
def get_examples(self, filename, max_examples=0):
""" """
data = csv.reader(
open(os.path.join(self.dataset_folder, filename), encoding="utf-8"),
delimiter=self.delimiter,
quoting=self.quoting,
)
examples = []
if self.has_header:
next(data)
for id, row in enumerate(data):
s1 = row[self.s1_col_idx]
s2 = row[self.s2_col_idx]
s3 = row[self.s3_col_idx]
examples.append(InputExample(texts=[s1, s2, s3]))
if max_examples > 0 and len(examples) >= max_examples:
break
return examples
from .InputExample import InputExample
from .LabelSentenceReader import LabelSentenceReader
from .NLIDataReader import NLIDataReader
from .STSDataReader import STSDataReader, STSBenchmarkDataReader
from .TripletReader import TripletReader
__all__ = [
"InputExample",
"LabelSentenceReader",
"NLIDataReader",
"STSDataReader",
"STSBenchmarkDataReader",
"TripletReader",
]
import functools
import requests
from torch import Tensor, device
from typing import List, Callable, Literal
from tqdm.autonotebook import tqdm
import sys
import importlib
import os
import torch
import numpy as np
import queue
import logging
from typing import Dict, Optional, Union, overload
from transformers import is_torch_npu_available
from huggingface_hub import snapshot_download, hf_hub_download
import heapq
logger = logging.getLogger(__name__)
def pytorch_cos_sim(a: Tensor, b: Tensor) -> Tensor:
"""
Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
:return: Matrix with res[i][j] = cos_sim(a[i], b[j])
"""
return cos_sim(a, b)
def cos_sim(a: Tensor, b: Tensor) -> Tensor:
"""
Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
:return: Matrix with res[i][j] = cos_sim(a[i], b[j])
"""
if not isinstance(a, torch.Tensor):
a = torch.tensor(a)
if not isinstance(b, torch.Tensor):
b = torch.tensor(b)
if len(a.shape) == 1:
a = a.unsqueeze(0)
if len(b.shape) == 1:
b = b.unsqueeze(0)
a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
return torch.mm(a_norm, b_norm.transpose(0, 1))
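# Example (illustrative sketch): cos_sim returns a [len(a), len(b)] similarity matrix.
demo_a = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
demo_b = torch.tensor([[1.0, 1.0]])
print(cos_sim(demo_a, demo_b))  # tensor([[0.7071], [0.7071]])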
def dot_score(a: Tensor, b: Tensor) -> Tensor:
"""
Computes the dot-product dot_prod(a[i], b[j]) for all i and j.
:return: Matrix with res[i][j] = dot_prod(a[i], b[j])
"""
if not isinstance(a, torch.Tensor):
a = torch.tensor(a)
if not isinstance(b, torch.Tensor):
b = torch.tensor(b)
if len(a.shape) == 1:
a = a.unsqueeze(0)
if len(b.shape) == 1:
b = b.unsqueeze(0)
return torch.mm(a, b.transpose(0, 1))
def pairwise_dot_score(a: Tensor, b: Tensor) -> Tensor:
"""
Computes the pairwise dot-product dot_prod(a[i], b[i])
:return: Vector with res[i] = dot_prod(a[i], b[i])
"""
if not isinstance(a, torch.Tensor):
a = torch.tensor(a)
if not isinstance(b, torch.Tensor):
b = torch.tensor(b)
return (a * b).sum(dim=-1)
def pairwise_cos_sim(a: Tensor, b: Tensor) -> Tensor:
"""
Computes the pairwise cossim cos_sim(a[i], b[i])
:return: Vector with res[i] = cos_sim(a[i], b[i])
"""
if not isinstance(a, torch.Tensor):
a = torch.tensor(a)
if not isinstance(b, torch.Tensor):
b = torch.tensor(b)
return pairwise_dot_score(normalize_embeddings(a), normalize_embeddings(b))
def pairwise_angle_sim(x: Tensor, y: Tensor) -> Tensor:
"""
Computes the absolute normalized angle distance;
see AnglELoss or https://arxiv.org/abs/2309.12871v1
for more information.
:return: Vector with res[i] = angle_sim(x[i], y[i])
"""
if not isinstance(x, torch.Tensor):
x = torch.tensor(x)
if not isinstance(y, torch.Tensor):
y = torch.tensor(y)
# modified from https://github.com/SeanLee97/AnglE/blob/main/angle_emb/angle.py
# chunk both tensors to obtain complex components
a, b = torch.chunk(x, 2, dim=1)
c, d = torch.chunk(y, 2, dim=1)
z = torch.sum(c**2 + d**2, dim=1, keepdim=True)
re = (a * c + b * d) / z
im = (b * c - a * d) / z
dz = torch.sum(a**2 + b**2, dim=1, keepdim=True) ** 0.5
dw = torch.sum(c**2 + d**2, dim=1, keepdim=True) ** 0.5
re /= dz / dw
im /= dz / dw
norm_angle = torch.sum(torch.concat((re, im), dim=1), dim=1)
return torch.abs(norm_angle)
def normalize_embeddings(embeddings: Tensor) -> Tensor:
"""
Normalizes the embeddings matrix, so that each sentence embedding has unit length
"""
return torch.nn.functional.normalize(embeddings, p=2, dim=1)
@overload
def truncate_embeddings(embeddings: np.ndarray, truncate_dim: Optional[int]) -> np.ndarray: ...
@overload
def truncate_embeddings(embeddings: torch.Tensor, truncate_dim: Optional[int]) -> torch.Tensor: ...
def truncate_embeddings(
embeddings: Union[np.ndarray, torch.Tensor], truncate_dim: Optional[int]
) -> Union[np.ndarray, torch.Tensor]:
"""
:param embeddings: Embeddings to truncate.
:param truncate_dim: The dimension to truncate sentence embeddings to. `None` does no truncation.
:return: Truncated embeddings.
"""
return embeddings[..., :truncate_dim]
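# Example (illustrative sketch): truncation just slices the trailing dimension,
# as used for Matryoshka-style embeddings.
demo_embeddings = np.ones((2, 768))
print(truncate_embeddings(demo_embeddings, 256).shape)  # (2, 256)
print(truncate_embeddings(demo_embeddings, None).shape)  # (2, 768); None disables truncation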
def paraphrase_mining(
model, sentences: List[str], show_progress_bar: bool = False, batch_size: int = 32, *args, **kwargs
) -> List[List[Union[float, int]]]:
"""
Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
other sentences and returns a list with the pairs that have the highest cosine similarity score.
:param model: SentenceTransformer model for embedding computation
:param sentences: A list of strings (texts or sentences)
:param show_progress_bar: Whether to show a progress bar
:param batch_size: Number of texts that are encoded simultaneously by the model
:param query_chunk_size: Search for the most similar pairs for #query_chunk_size sentences at a time. Decrease to lower memory footprint (increases run-time).
:param corpus_chunk_size: Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease to lower memory footprint (increases run-time).
:param max_pairs: Maximal number of text pairs returned.
:param top_k: For each sentence, we retrieve up to top_k other sentences
:param score_function: Function for computing scores. By default, cosine similarity.
:return: Returns a list of triplets with the format [score, id1, id2]
"""
# Compute embedding for the sentences
embeddings = model.encode(
sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_tensor=True
)
return paraphrase_mining_embeddings(embeddings, *args, **kwargs)
def paraphrase_mining_embeddings(
embeddings: Tensor,
query_chunk_size: int = 5000,
corpus_chunk_size: int = 100000,
max_pairs: int = 500000,
top_k: int = 100,
score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim,
) -> List[List[Union[float, int]]]:
"""
Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
other sentences and returns a list with the pairs that have the highest cosine similarity score.
:param embeddings: A tensor with the embeddings
:param query_chunk_size: Search for the most similar pairs for #query_chunk_size sentences at a time. Decrease to lower memory footprint (increases run-time).
:param corpus_chunk_size: Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease to lower memory footprint (increases run-time).
:param max_pairs: Maximal number of text pairs returned.
:param top_k: For each sentence, we retrieve up to top_k other sentences
:param score_function: Function for computing scores. By default, cosine similarity.
:return: Returns a list of triplets with the format [score, id1, id2]
"""
top_k += 1  # A sentence has the highest similarity to itself. Increase top_k by 1 as we are interested in distinct pairs
# Mine for duplicates
pairs = queue.PriorityQueue()
min_score = -1
num_added = 0
for corpus_start_idx in range(0, len(embeddings), corpus_chunk_size):
for query_start_idx in range(0, len(embeddings), query_chunk_size):
scores = score_function(
embeddings[query_start_idx : query_start_idx + query_chunk_size],
embeddings[corpus_start_idx : corpus_start_idx + corpus_chunk_size],
)
scores_top_k_values, scores_top_k_idx = torch.topk(
scores, min(top_k, len(scores[0])), dim=1, largest=True, sorted=False
)
scores_top_k_values = scores_top_k_values.cpu().tolist()
scores_top_k_idx = scores_top_k_idx.cpu().tolist()
for query_itr in range(len(scores)):
for top_k_idx, corpus_itr in enumerate(scores_top_k_idx[query_itr]):
i = query_start_idx + query_itr
j = corpus_start_idx + corpus_itr
if i != j and scores_top_k_values[query_itr][top_k_idx] > min_score:
pairs.put((scores_top_k_values[query_itr][top_k_idx], i, j))
num_added += 1
if num_added >= max_pairs:
entry = pairs.get()
min_score = entry[0]
# Get the pairs
added_pairs = set() # Used for duplicate detection
pairs_list = []
while not pairs.empty():
score, i, j = pairs.get()
sorted_i, sorted_j = sorted([i, j])
if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
added_pairs.add((sorted_i, sorted_j))
pairs_list.append([score, sorted_i, sorted_j])
# Highest scores first
pairs_list = sorted(pairs_list, key=lambda x: x[0], reverse=True)
return pairs_list
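# Example (illustrative sketch with three toy embeddings): the closest pair
# (ids 0 and 1) is returned first as [score, id1, id2].
demo_embeddings = torch.tensor([[1.0, 0.0], [0.99, 0.1], [0.0, 1.0]])
print(paraphrase_mining_embeddings(demo_embeddings, top_k=2)[0])  # approx. [0.995, 0, 1]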
def information_retrieval(*args, **kwargs) -> List[List[Dict[str, Union[int, float]]]]:
"""This function is deprecated. Use semantic_search instead"""
return semantic_search(*args, **kwargs)
def semantic_search(
query_embeddings: Tensor,
corpus_embeddings: Tensor,
query_chunk_size: int = 100,
corpus_chunk_size: int = 500000,
top_k: int = 10,
score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim,
) -> List[List[Dict[str, Union[int, float]]]]:
"""
This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings.
It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries.
:param query_embeddings: A 2 dimensional tensor with the query embeddings.
:param corpus_embeddings: A 2 dimensional tensor with the corpus embeddings.
:param query_chunk_size: Process this many queries simultaneously (default: 100). Increasing that value increases the speed, but requires more memory.
:param corpus_chunk_size: Scan the corpus in chunks of this many entries (default: 500k). Increasing that value increases the speed, but requires more memory.
:param top_k: Retrieve top k matching entries.
:param score_function: Function for computing scores. By default, cosine similarity.
:return: Returns a list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores.
"""
if isinstance(query_embeddings, (np.ndarray, np.generic)):
query_embeddings = torch.from_numpy(query_embeddings)
elif isinstance(query_embeddings, list):
query_embeddings = torch.stack(query_embeddings)
if len(query_embeddings.shape) == 1:
query_embeddings = query_embeddings.unsqueeze(0)
if isinstance(corpus_embeddings, (np.ndarray, np.generic)):
corpus_embeddings = torch.from_numpy(corpus_embeddings)
elif isinstance(corpus_embeddings, list):
corpus_embeddings = torch.stack(corpus_embeddings)
# Check that corpus and queries are on the same device
if corpus_embeddings.device != query_embeddings.device:
query_embeddings = query_embeddings.to(corpus_embeddings.device)
queries_result_list = [[] for _ in range(len(query_embeddings))]
for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
# Iterate over chunks of the corpus
for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
# Compute cosine similarities
cos_scores = score_function(
query_embeddings[query_start_idx : query_start_idx + query_chunk_size],
corpus_embeddings[corpus_start_idx : corpus_start_idx + corpus_chunk_size],
)
# Get top-k scores
cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
cos_scores, min(top_k, len(cos_scores[0])), dim=1, largest=True, sorted=False
)
cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
for query_itr in range(len(cos_scores)):
for sub_corpus_id, score in zip(cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr]):
corpus_id = corpus_start_idx + sub_corpus_id
query_id = query_start_idx + query_itr
if len(queries_result_list[query_id]) < top_k:
heapq.heappush(
queries_result_list[query_id], (score, corpus_id)
) # heapq orders entries by the first tuple element, i.e. the score
else:
heapq.heappushpop(queries_result_list[query_id], (score, corpus_id))
# change the data format and sort
for query_id in range(len(queries_result_list)):
for doc_itr in range(len(queries_result_list[query_id])):
score, corpus_id = queries_result_list[query_id][doc_itr]
queries_result_list[query_id][doc_itr] = {"corpus_id": corpus_id, "score": score}
queries_result_list[query_id] = sorted(queries_result_list[query_id], key=lambda x: x["score"], reverse=True)
return queries_result_list
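# Example (illustrative sketch with random stand-in embeddings): exhaustive
# top-k cosine search over an in-memory corpus.
demo_queries = torch.nn.functional.normalize(torch.randn(2, 64), p=2, dim=1)
demo_corpus = torch.nn.functional.normalize(torch.randn(100, 64), p=2, dim=1)
print(semantic_search(demo_queries, demo_corpus, top_k=3)[0])  # three hits, best first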
def http_get(url, path) -> None:
"""
Downloads a URL to a given path on disc
"""
if os.path.dirname(path) != "":
os.makedirs(os.path.dirname(path), exist_ok=True)
req = requests.get(url, stream=True)
if req.status_code != 200:
print("Exception when trying to download {}. Response {}".format(url, req.status_code), file=sys.stderr)
req.raise_for_status()
return
download_filepath = path + "_part"
with open(download_filepath, "wb") as file_binary:
content_length = req.headers.get("Content-Length")
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total, unit_scale=True)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
file_binary.write(chunk)
os.rename(download_filepath, path)
progress.close()
def batch_to_device(batch, target_device: device):
"""
send a pytorch batch to a device (CPU/GPU)
"""
for key in batch:
if isinstance(batch[key], Tensor):
batch[key] = batch[key].to(target_device)
return batch
def fullname(o) -> str:
"""
Gives a full name (package_name.class_name) for a class / object in Python. Will
be used to load the correct classes from JSON files
"""
module = o.__class__.__module__
if module is None or module == str.__class__.__module__:
return o.__class__.__name__ # Avoid reporting __builtin__
else:
return module + "." + o.__class__.__name__
def import_from_string(dotted_path):
"""
Import a dotted module path and return the attribute/class designated by the
last name in the path. Raise ImportError if the import failed.
"""
try:
module_path, class_name = dotted_path.rsplit(".", 1)
except ValueError:
msg = "%s doesn't look like a module path" % dotted_path
raise ImportError(msg)
try:
module = importlib.import_module(dotted_path)
except Exception:
module = importlib.import_module(module_path)
try:
return getattr(module, class_name)
except AttributeError:
msg = 'Module "%s" does not define a "%s" attribute/class' % (module_path, class_name)
raise ImportError(msg)
def community_detection(
embeddings, threshold=0.75, min_community_size=10, batch_size=1024, show_progress_bar=False
) -> List[List[int]]:
"""
Function for Fast Community Detection
Finds in the embeddings all communities, i.e. embeddings that are close (closer than threshold).
Returns only communities that are larger than min_community_size. The communities are returned
in decreasing order. The first element in each list is the central point in the community.
"""
if not isinstance(embeddings, torch.Tensor):
embeddings = torch.tensor(embeddings)
threshold = torch.tensor(threshold, device=embeddings.device)
embeddings = normalize_embeddings(embeddings)
extracted_communities = []
# Cap min_community_size (and thus the initial top-k size) at the number of embeddings
min_community_size = min(min_community_size, len(embeddings))
sort_max_size = min(max(2 * min_community_size, 50), len(embeddings))
for start_idx in tqdm(
range(0, len(embeddings), batch_size), desc="Finding clusters", disable=not show_progress_bar
):
# Compute cosine similarity scores
cos_scores = embeddings[start_idx : start_idx + batch_size] @ embeddings.T
# Use a torch-heavy approach if the embeddings are on CUDA, otherwise a loop-heavy one
if embeddings.device.type in ["cuda", "npu"]:
# Threshold the cos scores and determine how many close embeddings exist per embedding
threshold_mask = cos_scores >= threshold
row_wise_count = threshold_mask.sum(1)
# Only consider embeddings with enough close other embeddings
large_enough_mask = row_wise_count >= min_community_size
if not large_enough_mask.any():
continue
row_wise_count = row_wise_count[large_enough_mask]
cos_scores = cos_scores[large_enough_mask]
# The max is the largest potential community, so we use that in topk
k = row_wise_count.max()
_, top_k_indices = cos_scores.topk(k=k, largest=True)
# Use the row-wise count to slice the indices
for count, indices in zip(row_wise_count, top_k_indices):
extracted_communities.append(indices[:count].tolist())
else:
# Take the min_community_size largest similarities per row
top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)
# Filter for rows >= min_threshold
for i in range(len(top_k_values)):
if top_k_values[i][-1] >= threshold:
# Only check top k most similar entries
top_val_large, top_idx_large = cos_scores[i].topk(k=sort_max_size, largest=True)
# Check if we need to increase sort_max_size
while top_val_large[-1] > threshold and sort_max_size < len(embeddings):
sort_max_size = min(2 * sort_max_size, len(embeddings))
top_val_large, top_idx_large = cos_scores[i].topk(k=sort_max_size, largest=True)
extracted_communities.append(top_idx_large[top_val_large >= threshold].tolist())
# Largest cluster first
extracted_communities = sorted(extracted_communities, key=lambda x: len(x), reverse=True)
# Step 2) Remove overlapping communities
unique_communities = []
extracted_ids = set()
for cluster_id, community in enumerate(extracted_communities):
non_overlapped_community = []
for idx in community:
if idx not in extracted_ids:
non_overlapped_community.append(idx)
if len(non_overlapped_community) >= min_community_size:
unique_communities.append(non_overlapped_community)
extracted_ids.update(non_overlapped_community)
unique_communities = sorted(unique_communities, key=lambda x: len(x), reverse=True)
return unique_communities
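# Example (illustrative sketch): three tight clusters of five near-duplicate
# embeddings each; min_community_size is lowered to fit the toy data.
demo_centers = torch.randn(3, 1, 16)
demo_embeddings = (demo_centers + 0.01 * torch.randn(3, 5, 16)).reshape(15, 16)
demo_communities = community_detection(demo_embeddings, threshold=0.9, min_community_size=3)
print([len(c) for c in demo_communities])  # [5, 5, 5]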
class disabled_tqdm(tqdm):
"""
Class to override `disable` argument in case progress bars are globally disabled.
Taken from https://github.com/tqdm/tqdm/issues/619#issuecomment-619639324.
"""
def __init__(self, *args, **kwargs):
kwargs["disable"] = True
super().__init__(*args, **kwargs)
def __delattr__(self, attr: str) -> None:
"""Fix for https://github.com/huggingface/huggingface_hub/issues/1603"""
try:
super().__delattr__(attr)
except AttributeError:
if attr != "_lock":
raise
def is_sentence_transformer_model(
model_name_or_path: str,
token: Optional[Union[bool, str]] = None,
cache_folder: Optional[str] = None,
revision: Optional[str] = None,
) -> bool:
return bool(load_file_path(model_name_or_path, "modules.json", token, cache_folder, revision=revision))
def load_file_path(
model_name_or_path: str,
filename: str,
token: Optional[Union[bool, str]],
cache_folder: Optional[str],
revision: Optional[str] = None,
) -> Optional[str]:
# If file is local
file_path = os.path.join(model_name_or_path, filename)
if os.path.exists(file_path):
return file_path
# If file is remote
try:
return hf_hub_download(
model_name_or_path,
filename=filename,
revision=revision,
library_name="sentence-transformers",
token=token,
cache_dir=cache_folder,
)
except Exception:
return
def load_dir_path(
model_name_or_path: str,
directory: str,
token: Optional[Union[bool, str]],
cache_folder: Optional[str],
revision: Optional[str] = None,
) -> Optional[str]:
# If file is local
dir_path = os.path.join(model_name_or_path, directory)
if os.path.exists(dir_path):
return dir_path
download_kwargs = {
"repo_id": model_name_or_path,
"revision": revision,
"allow_patterns": f"{directory}/**",
"library_name": "sentence-transformers",
"token": token,
"cache_dir": cache_folder,
"tqdm_class": disabled_tqdm,
}
# Try to download from the remote
try:
repo_path = snapshot_download(**download_kwargs)
except Exception:
# Otherwise, try local (i.e. cache) only
download_kwargs["local_files_only"] = True
repo_path = snapshot_download(**download_kwargs)
return os.path.join(repo_path, directory)
def save_to_hub_args_decorator(func):
@functools.wraps(func)
def wrapper(self, *args, **kwargs):
# If repo_id not already set, use repo_name
repo_name = kwargs.pop("repo_name", None)
if repo_name and "repo_id" not in kwargs:
logger.warning(
"Providing a `repo_name` keyword argument to `save_to_hub` is deprecated, please use `repo_id` instead."
)
kwargs["repo_id"] = repo_name
# If positional args are used, adjust for the new "token" keyword argument
if len(args) >= 2:
args = (*args[:2], None, *args[2:])
return func(self, *args, **kwargs)
return wrapper
def get_device_name() -> Literal["mps", "cuda", "npu", "hpu", "cpu"]:
"""
Returns the name of the device where this module is running on.
This is a simple implementation that doesn't cover cases when more powerful GPUs are available but
are not the primary device ('cuda:0'), or when an MPS device is available but not configured properly:
https://pytorch.org/docs/master/notes/mps.html
:return: Device name, like 'cuda' or 'cpu'
"""
if torch.cuda.is_available():
return "cuda"
elif torch.backends.mps.is_available():
return "mps"
elif is_torch_npu_available():
return "npu"
elif importlib.util.find_spec("habana_frameworks") is not None:
import habana_frameworks.torch.hpu as hthpu
if hthpu.is_available():
return "hpu"
return "cpu"
[metadata]
description_file = README.md
from setuptools import setup, find_packages
with open("README.md", mode="r", encoding="utf-8") as readme_file:
readme = readme_file.read()
setup(
name="sentence-transformers",
version="2.7.0.dev0",
author="Nils Reimers",
author_email="info@nils-reimers.de",
description="Multilingual text embeddings",
long_description=readme,
long_description_content_type="text/markdown",
license="Apache License 2.0",
url="https://www.SBERT.net",
download_url="https://github.com/UKPLab/sentence-transformers/",
packages=find_packages(),
python_requires=">=3.8.0",
install_requires=[
"transformers>=4.32.0,<5.0.0",
"tqdm",
"torch>=1.11.0",
"numpy",
"scikit-learn",
"scipy",
"huggingface-hub>=0.15.1",
"Pillow",
],
extras_require={
"dev": [
"pre-commit",
"pytest",
"ruff>=0.3.0",
],
},
classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.8",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
keywords="Transformer Networks BERT XLNet sentence embedding PyTorch NLP deep learning",
)
import os
import platform
import tempfile
import pytest
from sentence_transformers import SentenceTransformer, CrossEncoder
from sentence_transformers.models import Transformer, Pooling
@pytest.fixture()
def stsb_bert_tiny_model() -> SentenceTransformer:
return SentenceTransformer("sentence-transformers-testing/stsb-bert-tiny-safetensors")
@pytest.fixture(scope="session")
def stsb_bert_tiny_model_reused() -> SentenceTransformer:
return SentenceTransformer("sentence-transformers-testing/stsb-bert-tiny-safetensors")
@pytest.fixture()
def paraphrase_distilroberta_base_v1_model() -> SentenceTransformer:
return SentenceTransformer("paraphrase-distilroberta-base-v1")
@pytest.fixture()
def distilroberta_base_ce_model() -> CrossEncoder:
return CrossEncoder("distilroberta-base", num_labels=1)
@pytest.fixture()
def clip_vit_b_32_model() -> SentenceTransformer:
return SentenceTransformer("clip-ViT-B-32")
@pytest.fixture()
def distilbert_base_uncased_model() -> SentenceTransformer:
word_embedding_model = Transformer("distilbert-base-uncased")
pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
return model
@pytest.fixture()
def cache_dir():
"""
In the CI environment, we use a temporary directory as `cache_dir`
to avoid keeping the downloaded models on disk after the test.
This is only required for Ubuntu, as we otherwise have disk space issues there.
"""
if os.environ.get("CI", None) and platform.system() == "Linux":
with tempfile.TemporaryDirectory() as tmp_dir:
yield tmp_dir
else:
yield None
from contextlib import nullcontext
from typing import List
import pytest
from sentence_transformers import SentenceTransformer, InputExample, losses
import tqdm
from transformers import set_seed
import torch
from torch.optim import Adam
@pytest.mark.parametrize(
["train_samples_mnrl", "train_samples_cmnrl", "same_grad", "scaler", "precision"],
[
(
[
InputExample(texts=[q, p, n])
for q, p, n in zip(
["aaa", "bbb", "ccc", "ddd", "eee"],
["aas", "bbs", "ccs", "dds", "ees"],
["xxx", "yyy", "zzz", "kkk", "fff"],
)
],
[
InputExample(texts=[q, p, n])
for q, p, n in zip(
["aaa", "bbb", "ccc", "ddd", "eee"],
["aas", "bbs", "ccs", "dds", "ees"],
["xxx", "yyy", "zzz", "kkk", "fff"],
)
],
True,
1.0,
1e-6,
),
(
[
InputExample(texts=[q, p, n])
for q, p, n in zip(
["adsa", "czx", "dsada"],
["b", "fas", "xcz"],
["c", "yyy", "asdas"],
)
],
[
InputExample(texts=[q, p, n])
for q, p, n in zip(
["aaa", "bbb", "ccc", "ddd", "eee"],
["aas", "bbs", "ccs", "dds", "ees"],
["xxx", "yyy", "zzz", "kkk", "fff"],
)
],
False,
1.0,
1e-6,
),
(
[
InputExample(texts=[q, p, n])
for q, p, n in zip(
["aaa", "bbb", "ccc", "ddd", "eee"],
["aas", "bbs", "ccs", "dds", "ees"],
["xxx", "yyy", "zzz", "kkk", "fff"],
)
],
[
InputExample(texts=[q, p, n])
for q, p, n in zip(
["aaa", "bbb", "ccc", "ddd", "eee"],
["aas", "bbs", "ccs", "dds", "ees"],
["xxx", "yyy", "zzz", "kkk", "fff"],
)
],
True,
1000.0,
1e-3,
),
],
)
def test_cmnrl_same_grad(
train_samples_mnrl: List[InputExample],
train_samples_cmnrl: List[InputExample],
same_grad: bool,
scaler: float,
precision: float,
):
# Given:
sbert = SentenceTransformer("distilbert-base-uncased")
sbert.to("cpu")
optimizer = Adam(sbert.parameters())
# train_samples_mnrl
# train_samples_cmnrl
# same_grad
# scaler # This simulates AMP scenarios
# precision
# When:
# First run with MNRL
set_seed(42)
optimizer.zero_grad()
loss_mnrl = losses.MultipleNegativesRankingLoss(sbert)
loss_mnrl_value: torch.Tensor = loss_mnrl.forward(*sbert.smart_batching_collate(train_samples_mnrl)) * scaler
loss_mnrl_value.backward()
grad_expected = {name: p.grad.clone() for name, p in loss_mnrl.named_parameters() if p.grad is not None}
# Then run with this cached version:
set_seed(42)
optimizer.zero_grad()
loss_cmnrl = losses.CachedMultipleNegativesRankingLoss(sbert, mini_batch_size=2)
loss_cmnrl_value = loss_cmnrl.forward(*sbert.smart_batching_collate(train_samples_cmnrl)) * scaler
loss_cmnrl_value.backward()
grad = {name: p.grad.clone() for name, p in loss_cmnrl.named_parameters() if p.grad is not None}
# Then:
if same_grad:
assert pytest.approx(loss_mnrl_value.item()) == loss_cmnrl_value.item()
else:
assert pytest.approx(loss_mnrl_value.item()) != loss_cmnrl_value.item()
nclose = 0
for name in tqdm.tqdm(grad_expected):
nclose += torch.allclose(grad[name], grad_expected[name], precision, precision)
if same_grad:
assert nclose == len(grad_expected)
else:
assert nclose != len(grad_expected)
@pytest.mark.parametrize("use_rand_context", [True, False])
def test_rand_context_working(use_rand_context: bool):
# Given:
from sentence_transformers.losses.CachedMultipleNegativesRankingLoss import (
RandContext,
)
a = torch.Tensor(1)
b = torch.Tensor(1)
random_state = RandContext(a, b) if use_rand_context else nullcontext()
expected = torch.rand(1000)
precision = 1e-6
# When:
with random_state:
# Then:
if use_rand_context:
assert torch.allclose(torch.rand(1000), expected, precision, precision)
else:
assert not torch.allclose(torch.rand(1000), expected, precision, precision)
"""
Computes embeddings
"""
import numpy as np
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import get_device_name
def test_encode_token_embeddings(paraphrase_distilroberta_base_v1_model: SentenceTransformer) -> None:
"""
Test that encode(output_value='token_embeddings') works
:return:
"""
model = paraphrase_distilroberta_base_v1_model
sent = [
"Hello Word, a test sentence",
"Here comes another sentence",
"My final sentence",
"Sentences",
"Sentence five five five five five five five",
]
emb = model.encode(sent, output_value="token_embeddings", batch_size=2)
assert len(emb) == len(sent)
device = get_device_name()
if device == "hpu":
for s, e in zip(sent, emb):
assert len(model.tokenize([s])["input_ids"][0]) == model.get_max_seq_length()
else:
for s, e in zip(sent, emb):
assert len(model.tokenize([s])["input_ids"][0]) == e.shape[0]
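# Sketch (assumption, not part of the test suite): token embeddings are commonly
# reduced to a single sentence vector by mean pooling over the token axis; this
# helper shows the idea for one text.
def example_mean_pool_tokens(model: SentenceTransformer, text: str) -> np.ndarray:
    token_embeddings = model.encode(text, output_value="token_embeddings")  # (num_tokens, dim)
    return token_embeddings.mean(dim=0).cpu().numpy()  # (dim,)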
def test_encode_single_sentences(paraphrase_distilroberta_base_v1_model: SentenceTransformer) -> None:
model = paraphrase_distilroberta_base_v1_model
# Single sentence
emb = model.encode("Hello Word, a test sentence")
assert emb.shape == (768,)
assert abs(np.sum(emb) - 7.9811716) < 0.002
# Single sentence as list
emb = model.encode(["Hello Word, a test sentence"])
assert emb.shape == (1, 768)
assert abs(np.sum(emb) - 7.9811716) < 0.002
# Sentence list
emb = model.encode(
[
"Hello Word, a test sentence",
"Here comes another sentence",
"My final sentence",
]
)
assert emb.shape == (3, 768)
assert abs(np.sum(emb) - 22.968266) < 0.007
def test_encode_normalize(paraphrase_distilroberta_base_v1_model: SentenceTransformer) -> None:
model = paraphrase_distilroberta_base_v1_model
emb = model.encode(
[
"Hello Word, a test sentence",
"Here comes another sentence",
"My final sentence",
],
normalize_embeddings=True,
)
assert emb.shape == (3, 768)
for norm in np.linalg.norm(emb, axis=1):
assert abs(norm - 1) < 0.001
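# Follow-on sketch (assumption): with normalize_embeddings=True, cosine similarity
# reduces to a plain dot product, so util.cos_sim and util.dot_score agree.
def example_normalized_dot_equals_cosine(model: SentenceTransformer) -> None:
    from sentence_transformers import util
    emb = model.encode(["first sentence", "second sentence"], normalize_embeddings=True)
    cosine = float(util.cos_sim(emb[0], emb[1]))
    dot = float(util.dot_score(emb[0], emb[1]))
    assert abs(cosine - dot) < 1e-5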
def test_encode_tuple_sentences(paraphrase_distilroberta_base_v1_model: SentenceTransformer) -> None:
model = paraphrase_distilroberta_base_v1_model
# Input a sentence tuple
emb = model.encode([("Hello Word, a test sentence", "Second input for model")])
assert emb.shape == (1, 768)
assert abs(np.sum(emb) - 9.503508) < 0.002
# List of sentence tuples
emb = model.encode(
[
("Hello Word, a test sentence", "Second input for model"),
("My second tuple", "With two inputs"),
("Final tuple", "final test"),
]
)
assert emb.shape == (3, 768)
assert abs(np.sum(emb) - 32.14627) < 0.002
"""
Tests that the pretrained models produce the correct scores on the STSbenchmark dataset
"""
import csv
import gzip
import os
from pathlib import Path
import tempfile
import pytest
import torch
from torch.utils.data import DataLoader
from sentence_transformers import CrossEncoder, util
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers.readers import InputExample
from typing import Generator, List, Optional, Tuple
@pytest.fixture()
def sts_resource() -> Generator[Tuple[List[InputExample], List[InputExample]], None, None]:
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
stsb_train_samples = []
stsb_test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "test":
stsb_test_samples.append(inp_example)
elif row["split"] == "train":
stsb_train_samples.append(inp_example)
yield stsb_train_samples, stsb_test_samples
def evaluate_stsb_test(
distilroberta_base_ce_model: CrossEncoder,
expected_score: float,
test_samples: List[InputExample],
num_test_samples: int = -1,
) -> None:
model = distilroberta_base_ce_model
evaluator = CECorrelationEvaluator.from_input_examples(test_samples[:num_test_samples], name="sts-test")
score = evaluator(model) * 100
print("STS-Test Performance: {:.2f} vs. exp: {:.2f}".format(score, expected_score))
assert score > expected_score or abs(score - expected_score) < 0.1
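# Sketch (assumption about the metric): CECorrelationEvaluator scores a model by the
# Spearman rank correlation between predicted scores and gold labels; computed by
# hand it would look roughly like this.
def example_spearman_by_hand(model: CrossEncoder, samples: List[InputExample]) -> float:
    from scipy.stats import spearmanr
    predictions = model.predict([sample.texts for sample in samples])
    labels = [sample.label for sample in samples]
    return spearmanr(predictions, labels).correlation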
def test_pretrained_stsb(sts_resource: Tuple[List[InputExample], List[InputExample]]):
_, sts_test_samples = sts_resource
model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
evaluate_stsb_test(model, 87.92, sts_test_samples)
@pytest.mark.slow
def test_train_stsb_slow(
distilroberta_base_ce_model: CrossEncoder, sts_resource: Tuple[List[InputExample], List[InputExample]]
) -> None:
model = distilroberta_base_ce_model
sts_train_samples, sts_test_samples = sts_resource
train_dataloader = DataLoader(sts_train_samples, shuffle=True, batch_size=16)
model.fit(
train_dataloader=train_dataloader,
epochs=1,
warmup_steps=int(len(train_dataloader) * 0.1),
)
evaluate_stsb_test(model, 75, sts_test_samples)
def test_train_stsb(
distilroberta_base_ce_model: CrossEncoder, sts_resource: Tuple[List[InputExample], List[InputExample]]
) -> None:
model = distilroberta_base_ce_model
sts_train_samples, sts_test_samples = sts_resource
train_dataloader = DataLoader(sts_train_samples[:500], shuffle=True, batch_size=16)
model.fit(
train_dataloader=train_dataloader,
epochs=1,
warmup_steps=int(len(train_dataloader) * 0.1),
)
evaluate_stsb_test(model, 50, sts_test_samples, num_test_samples=100)
def test_classifier_dropout_is_set() -> None:
model = CrossEncoder("cross-encoder/stsb-distilroberta-base", classifier_dropout=0.1234)
assert model.config.classifier_dropout == 0.1234
assert model.model.config.classifier_dropout == 0.1234
def test_classifier_dropout_default_value() -> None:
model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
assert model.config.classifier_dropout is None
assert model.model.config.classifier_dropout is None
def test_load_with_revision() -> None:
model_name = "sentence-transformers-testing/stsb-bert-tiny-safetensors"
main_model = CrossEncoder(model_name, num_labels=1, revision="main")
latest_model = CrossEncoder(
model_name,
num_labels=1,
revision="f3cb857cba53019a20df283396bcca179cf051a4",
)
older_model = CrossEncoder(
model_name,
num_labels=1,
revision="ba33022fdf0b0fc2643263f0726f44d0a07d0e24",
)
    # Set the classifier.bias and classifier.weight equal across the models. This is
    # needed because AutoModelForSequenceClassification randomly initializes the
    # classifier head on every model initialization, so predictions are only
    # comparable if all models share the same classifier parameters.
latest_model.model.classifier.bias = main_model.model.classifier.bias
latest_model.model.classifier.weight = main_model.model.classifier.weight
older_model.model.classifier.bias = main_model.model.classifier.bias
older_model.model.classifier.weight = main_model.model.classifier.weight
test_sentences = [["Hello there!", "Hello, World!"]]
main_prob = main_model.predict(test_sentences, convert_to_tensor=True)
assert torch.equal(main_prob, latest_model.predict(test_sentences, convert_to_tensor=True))
assert not torch.equal(main_prob, older_model.predict(test_sentences, convert_to_tensor=True))
def test_rank() -> None:
model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
# We want to compute the similarity between the query sentence
query = "A man is eating pasta."
# With all sentences in the corpus
corpus = [
"A man is eating food.",
"A man is eating a piece of bread.",
"The girl is carrying a baby.",
"A man is riding a horse.",
"A woman is playing violin.",
"Two men pushed carts through the woods.",
"A man is riding a white horse on an enclosed ground.",
"A monkey is playing drums.",
"A cheetah is running behind its prey.",
]
expected_ranking = [0, 1, 3, 6, 2, 5, 7, 4, 8]
# 1. We rank all sentences in the corpus for the query
ranks = model.rank(query, corpus)
pred_ranking = [rank["corpus_id"] for rank in ranks]
assert pred_ranking == expected_ranking
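# Sketch (assumption): rank() is equivalent to scoring each (query, document) pair
# with predict() and sorting by descending score; this reproduces the ranking by hand.
def example_rank_by_hand(model: CrossEncoder, query: str, corpus: List[str]) -> List[int]:
    scores = model.predict([[query, document] for document in corpus])
    return sorted(range(len(corpus)), key=lambda i: scores[i], reverse=True)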
@pytest.mark.parametrize("safe_serialization", [True, False, None])
def test_safe_serialization(safe_serialization: Optional[bool]) -> None:
with tempfile.TemporaryDirectory() as cache_folder:
model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
if safe_serialization:
model.save(cache_folder, safe_serialization=safe_serialization)
model_files = list(Path(cache_folder).glob("**/model.safetensors"))
assert 1 == len(model_files)
elif safe_serialization is None:
model.save(cache_folder)
model_files = list(Path(cache_folder).glob("**/model.safetensors"))
assert 1 == len(model_files)
else:
model.save(cache_folder, safe_serialization=safe_serialization)
model_files = list(Path(cache_folder).glob("**/pytorch_model.bin"))
assert 1 == len(model_files)
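# Follow-on sketch (assumption): a model saved to a folder with either serialization
# format can be reloaded by passing the local path to the CrossEncoder constructor.
def example_save_and_reload() -> None:
    with tempfile.TemporaryDirectory() as tmp_dir:
        model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
        model.save(tmp_dir)  # safetensors by default, as asserted above
        reloaded = CrossEncoder(tmp_dir)
        assert reloaded.predict([["Hello there!", "Hello, World!"]]).shape == (1,)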
"""
Tests the correct computation of evaluation scores from BinaryClassificationEvaluator
"""
import csv
import gzip
import os
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader
from sentence_transformers import (
InputExample,
SentenceTransformer,
evaluation,
losses,
util,
)
def test_BinaryClassificationEvaluator_find_best_f1_and_threshold() -> None:
"""Tests that the F1 score for the computed threshold is correct"""
y_true = np.random.randint(0, 2, 1000)
y_pred_cosine = np.random.randn(1000)
(
best_f1,
best_precision,
best_recall,
threshold,
) = evaluation.BinaryClassificationEvaluator.find_best_f1_and_threshold(
y_pred_cosine, y_true, high_score_more_similar=True
)
y_pred_labels = [1 if pred >= threshold else 0 for pred in y_pred_cosine]
sklearn_f1score = f1_score(y_true, y_pred_labels)
assert np.abs(best_f1 - sklearn_f1score) < 1e-6
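# Reference sketch (assumption about the algorithm, not the library's internals): the
# best F1 threshold can be found by sweeping every candidate score as a cut-off and
# keeping the one with the highest F1 for "predict 1 at or above the cut".
def brute_force_best_f1(scores: np.ndarray, labels: np.ndarray):
    best_f1, best_threshold = 0.0, 0.0
    for threshold in np.unique(scores):
        predictions = (scores >= threshold).astype(int)
        score = f1_score(labels, predictions)
        if score > best_f1:
            best_f1, best_threshold = score, threshold
    return best_f1, best_threshold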
def test_BinaryClassificationEvaluator_find_best_accuracy_and_threshold() -> None:
"""Tests that the Acc score for the computed threshold is correct"""
y_true = np.random.randint(0, 2, 1000)
y_pred_cosine = np.random.randn(1000)
(
max_acc,
threshold,
) = evaluation.BinaryClassificationEvaluator.find_best_acc_and_threshold(
y_pred_cosine, y_true, high_score_more_similar=True
)
y_pred_labels = [1 if pred >= threshold else 0 for pred in y_pred_cosine]
sklearn_acc = accuracy_score(y_true, y_pred_labels)
assert np.abs(max_acc - sklearn_acc) < 1e-6
def test_LabelAccuracyEvaluator(paraphrase_distilroberta_base_v1_model: SentenceTransformer) -> None:
"""Tests that the LabelAccuracyEvaluator can be loaded correctly"""
model = paraphrase_distilroberta_base_v1_model
nli_dataset_path = "datasets/AllNLI.tsv.gz"
if not os.path.exists(nli_dataset_path):
util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
dev_samples = []
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "train":
label_id = label2int[row["label"]]
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=label_id))
if len(dev_samples) >= 100:
break
train_loss = losses.SoftmaxLoss(
model=model,
sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
num_labels=len(label2int),
)
dev_dataloader = DataLoader(dev_samples, shuffle=False, batch_size=16)
evaluator = evaluation.LabelAccuracyEvaluator(dev_dataloader, softmax_model=train_loss)
acc = evaluator(model)
assert acc > 0.2
def test_ParaphraseMiningEvaluator(paraphrase_distilroberta_base_v1_model: SentenceTransformer) -> None:
"""Tests that the ParaphraseMiningEvaluator can be loaded"""
model = paraphrase_distilroberta_base_v1_model
sentences = {
0: "Hello World",
1: "Hello World!",
2: "The cat is on the table",
3: "On the table the cat is",
}
data_eval = evaluation.ParaphraseMiningEvaluator(sentences, [(0, 1), (2, 3)])
score = data_eval(model)
assert score > 0.99
"""
Compute image embeddings
"""
import os
from PIL import Image
from sentence_transformers import util, SentenceTransformer
def test_simple_encode(clip_vit_b_32_model: SentenceTransformer) -> None:
model = clip_vit_b_32_model
# Encode an image:
image_filepath = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"../examples/applications/image-search/two_dogs_in_snow.jpg",
)
img_emb = model.encode(Image.open(image_filepath))
# Encode text descriptions
text_emb = model.encode(["Two dogs in the snow", "A cat on a table", "A picture of London at night"])
# Compute cosine similarities
cos_scores = util.cos_sim(img_emb, text_emb)[0]
assert abs(cos_scores[0] - 0.3069) < 0.01
assert abs(cos_scores[1] - 0.1010) < 0.01
assert abs(cos_scores[2] - 0.1086) < 0.01
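# Follow-on sketch (assumption): because CLIP embeds images and texts into the same
# vector space, the same util.cos_sim call also works image-to-image.
def example_image_image_similarity(model: SentenceTransformer, path_a: str, path_b: str) -> float:
    embeddings = model.encode([Image.open(path_a), Image.open(path_b)])
    return float(util.cos_sim(embeddings[0], embeddings[1]))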