First add

0fccd232 · Rayyyyy · 0fccd232 · 0fccd232 · 0fccd232 · 0fccd232
Commit 0fccd232 authored May 27, 2024 by Rayyyyy
20 changed files
--- a/sentence_transformers/models/WeightedLayerPooling.py
+++ b/sentence_transformers/models/WeightedLayerPooling.py
+import torch
+from torch import Tensor
+from torch import nn
+from typing import Dict
+import os
+import json
+
+
+class WeightedLayerPooling(nn.Module):
+    """
+    Build token embeddings as a weighted mean over several hidden-layer outputs.
+
+    All entries of ``features["all_layer_embeddings"]`` from index ``layer_start``
+    onwards take part. Unless ``layer_weights`` is supplied, the weights are an
+    ``nn.Parameter`` initialised to ones.
+    """
+
+    def __init__(
+        self, word_embedding_dimension, num_hidden_layers: int = 12, layer_start: int = 4, layer_weights=None
+    ):
+        super().__init__()
+        self.config_keys = ["word_embedding_dimension", "layer_start", "num_hidden_layers"]
+        self.word_embedding_dimension = word_embedding_dimension
+        self.layer_start = layer_start
+        self.num_hidden_layers = num_hidden_layers
+
+        if layer_weights is None:
+            # One weight per pooled layer, assuming all_layer_embeddings holds
+            # num_hidden_layers + 1 tensors -- TODO confirm against the producer.
+            pooled_layer_count = num_hidden_layers + 1 - layer_start
+            layer_weights = nn.Parameter(torch.tensor([1] * pooled_layer_count, dtype=torch.float))
+        self.layer_weights = layer_weights
+
+    def forward(self, features: Dict[str, Tensor]):
+        """Store the weighted layer average under ``"token_embeddings"`` and return ``features``."""
+        stacked_layers = torch.stack(features["all_layer_embeddings"])
+        # Keep only the layers from layer_start on; the index form expects a 4-D stack.
+        pooled_layers = stacked_layers[self.layer_start :, :, :, :]
+
+        # Append three trailing axes to the weights, then broadcast to the stack's shape.
+        weights = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+        weights = weights.expand(pooled_layers.size())
+        weighted_sum = (weights * pooled_layers).sum(dim=0)
+        weighted_average = weighted_sum / self.layer_weights.sum()
+
+        features.update({"token_embeddings": weighted_average})
+        return features
+
+    def get_word_embedding_dimension(self):
+        """Return the embedding dimension passed to the constructor."""
+        return self.word_embedding_dimension
+
+    def get_config_dict(self):
+        """Return the attributes named in ``config_keys`` as a dict."""
+        attributes = self.__dict__
+        return {name: attributes[name] for name in self.config_keys}
+
+    def save(self, output_path):
+        """Write ``config.json`` and ``pytorch_model.bin`` into ``output_path``."""
+        config_file = os.path.join(output_path, "config.json")
+        with open(config_file, "w") as fOut:
+            json.dump(self.get_config_dict(), fOut, indent=2)
+
+        weights_file = os.path.join(output_path, "pytorch_model.bin")
+        torch.save(self.state_dict(), weights_file)
+
+    @staticmethod
+    def load(input_path):
+        """Recreate the module from the two files written by ``save``."""
+        with open(os.path.join(input_path, "config.json")) as fIn:
+            config = json.load(fIn)
+
+        model = WeightedLayerPooling(**config)
+        weights_file = os.path.join(input_path, "pytorch_model.bin")
+        model.load_state_dict(torch.load(weights_file, map_location=torch.device("cpu")))
+        return model
--- a/sentence_transformers/models/WordEmbeddings.py
+++ b/sentence_transformers/models/WordEmbeddings.py
+import torch
+from torch import nn
+from typing import List
+import logging
+import gzip
+from tqdm import tqdm
+import numpy as np
+import os
+import json
+from ..util import import_from_string, fullname, http_get
+from .tokenizer import WordTokenizer, WhitespaceTokenizer
+
+
+logger = logging.getLogger(__name__)
+
+
+class WordEmbeddings(nn.Module):
+    """
+    Embedding look-up module that turns token ids into word vectors.
+
+    :param tokenizer: Tokenizer used by ``tokenize`` to convert text into token ids.
+    :param embedding_weights: 2-D weights (list, numpy array or tensor); one row per token id.
+    :param update_embeddings: Whether the embedding matrix requires gradients.
+    :param max_seq_length: Stored and written to the config; ``tokenize`` does not truncate to it.
+    """
+
+    def __init__(
+        self,
+        tokenizer: WordTokenizer,
+        embedding_weights,
+        update_embeddings: bool = False,
+        max_seq_length: int = 1000000,
+    ):
+        nn.Module.__init__(self)
+        # Lists and numpy arrays are converted step by step into a tensor.
+        if isinstance(embedding_weights, list):
+            embedding_weights = np.asarray(embedding_weights)
+
+        if isinstance(embedding_weights, np.ndarray):
+            embedding_weights = torch.from_numpy(embedding_weights)
+
+        num_embeddings, embeddings_dimension = embedding_weights.size()
+        self.embeddings_dimension = embeddings_dimension
+        self.emb_layer = nn.Embedding(num_embeddings, embeddings_dimension)
+        self.emb_layer.load_state_dict({"weight": embedding_weights})
+        self.emb_layer.weight.requires_grad = update_embeddings
+        self.tokenizer = tokenizer
+        self.update_embeddings = update_embeddings
+        self.max_seq_length = max_seq_length
+
+    def forward(self, features):
+        """Look up ``features["input_ids"]`` and add the result as ``"token_embeddings"``."""
+        token_embeddings = self.emb_layer(features["input_ids"])
+        cls_tokens = None
+        features.update(
+            {
+                "token_embeddings": token_embeddings,
+                "cls_token_embeddings": cls_tokens,
+                "attention_mask": features["attention_mask"],
+            }
+        )
+        return features
+
+    def tokenize(self, texts: List[str], **kwargs):
+        """
+        Tokenize ``texts`` and pad every id sequence with zeros to the longest one.
+
+        :return: Dict of long tensors ``input_ids``, ``attention_mask`` and ``sentence_lengths``.
+        """
+        tokenized_texts = [self.tokenizer.tokenize(text, **kwargs) for text in texts]
+        sentence_lengths = [len(tokens) for tokens in tokenized_texts]
+        # default=0 keeps an empty batch from raising ValueError in max().
+        max_len = max(sentence_lengths, default=0)
+
+        input_ids = []
+        attention_masks = []
+        for tokens in tokenized_texts:
+            padding = [0] * (max_len - len(tokens))
+            input_ids.append(tokens + padding)
+            attention_masks.append([1] * len(tokens) + padding)
+
+        output = {
+            "input_ids": torch.tensor(input_ids, dtype=torch.long),
+            "attention_mask": torch.tensor(attention_masks, dtype=torch.long),
+            "sentence_lengths": torch.tensor(sentence_lengths, dtype=torch.long),
+        }
+
+        return output
+
+    def get_word_embedding_dimension(self) -> int:
+        """Return the number of columns of the embedding matrix."""
+        return self.embeddings_dimension
+
+    def save(self, output_path: str):
+        """Write the config, the state dict and the tokenizer files into ``output_path``."""
+        with open(os.path.join(output_path, "wordembedding_config.json"), "w") as fOut:
+            json.dump(self.get_config_dict(), fOut, indent=2)
+
+        torch.save(self.state_dict(), os.path.join(output_path, "pytorch_model.bin"))
+        self.tokenizer.save(output_path)
+
+    def get_config_dict(self):
+        """Return the JSON-serialisable settings needed by ``load``."""
+        return {
+            "tokenizer_class": fullname(self.tokenizer),
+            "update_embeddings": self.update_embeddings,
+            "max_seq_length": self.max_seq_length,
+        }
+
+    @staticmethod
+    def load(input_path: str):
+        """Recreate a model from the files written by ``save``."""
+        with open(os.path.join(input_path, "wordembedding_config.json"), "r") as fIn:
+            config = json.load(fIn)
+
+        tokenizer_class = import_from_string(config["tokenizer_class"])
+        tokenizer = tokenizer_class.load(input_path)
+        weights = torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))
+        embedding_weights = weights["emb_layer.weight"]
+        model = WordEmbeddings(
+            tokenizer=tokenizer,
+            embedding_weights=embedding_weights,
+            update_embeddings=config["update_embeddings"],
+            # Restore the saved value; use the constructor default if the key is absent.
+            max_seq_length=config.get("max_seq_length", 1000000),
+        )
+        return model
+
+    @staticmethod
+    def from_text_file(
+        embeddings_file_path: str,
+        update_embeddings: bool = False,
+        item_separator: str = " ",
+        tokenizer=None,
+        max_vocab_size: int = None,
+    ):
+        """
+        Build a model from a text file with one ``token value value ...`` entry per line.
+
+        Paths ending in ``.gz`` are read through gzip. A missing file whose name has no
+        path separator is downloaded first. Index 0 is reserved for a zero-vector
+        ``PADDING_TOKEN``.
+
+        :param embeddings_file_path: Path (or bare file name) of the embeddings file.
+        :param update_embeddings: Passed on to the constructor.
+        :param item_separator: Separator between the fields of a line.
+        :param tokenizer: Tokenizer that receives the vocabulary; a new ``WhitespaceTokenizer``
+            is created per call when omitted.
+        :param max_vocab_size: Stop reading once the vocabulary (padding entry included)
+            exceeds this size; ``None`` or a non-positive value disables the limit.
+        :raises ValueError: If the file is missing and the path contains a separator.
+        """
+        # Create the default here: a default instance in the signature would be shared
+        # by every call and overwritten by set_vocab() below.
+        if tokenizer is None:
+            tokenizer = WhitespaceTokenizer()
+
+        logger.info("Read in embeddings file {}".format(embeddings_file_path))
+
+        if not os.path.exists(embeddings_file_path):
+            logger.info("{} does not exist, try to download from server".format(embeddings_file_path))
+
+            if "/" in embeddings_file_path or "\\" in embeddings_file_path:
+                raise ValueError("Embeddings file not found: {}".format(embeddings_file_path))
+
+            url = "https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/" + embeddings_file_path
+            http_get(url, embeddings_file_path)
+
+        embeddings_dimension = None
+        vocab = []
+        embeddings = []
+
+        with gzip.open(embeddings_file_path, "rt", encoding="utf8") if embeddings_file_path.endswith(".gz") else open(
+            embeddings_file_path, encoding="utf8"
+        ) as fIn:
+            iterator = tqdm(fIn, desc="Load Word Embeddings", unit="Embeddings")
+            for line in iterator:
+                split = line.rstrip().split(item_separator)
+
+                if not vocab and len(split) == 2:  # Handle Word2vec format
+                    continue
+
+                word = split[0]
+
+                if embeddings_dimension is None:
+                    # The first data line fixes the dimension and creates the padding row.
+                    embeddings_dimension = len(split) - 1
+                    vocab.append("PADDING_TOKEN")
+                    embeddings.append(np.zeros(embeddings_dimension))
+
+                if (
+                    len(split) - 1
+                ) != embeddings_dimension:  # Assure that all lines in the embeddings file are of the same length
+                    logger.error(
+                        "ERROR: A line in the embeddings file had more or less  dimensions than expected. Skip token."
+                    )
+                    continue
+
+                vector = np.array([float(num) for num in split[1:]])
+                embeddings.append(vector)
+                vocab.append(word)
+
+                if max_vocab_size is not None and max_vocab_size > 0 and len(vocab) > max_vocab_size:
+                    break
+
+            embeddings = np.asarray(embeddings)
+
+            tokenizer.set_vocab(vocab)
+            return WordEmbeddings(
+                tokenizer=tokenizer, embedding_weights=embeddings, update_embeddings=update_embeddings
+            )
--- a/sentence_transformers/models/WordWeights.py
+++ b/sentence_transformers/models/WordWeights.py
+import torch
+from torch import Tensor
+from torch import nn
+from typing import List, Dict
+import os
+import json
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+
+class WordWeights(nn.Module):
+    """Scale each token embedding by a per-word weight, e.g. an idf value."""
+
+    def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1):
+        """
+
+        :param vocab:
+            Vocabulary of the tokenizer
+        :param word_weights:
+            Mapping from token to a float weight; token embeddings are multiplied by it. The keys need not match the vocab exactly (there may be more or fewer of them). A word is looked up as-is first, then lower-cased.
+        :param unknown_word_weight:
+            Weight used for vocab words that are found in neither form in word_weights.
+        """
+        super().__init__()
+        self.config_keys = ["vocab", "word_weights", "unknown_word_weight"]
+        self.vocab = vocab
+        self.word_weights = word_weights
+        self.unknown_word_weight = unknown_word_weight
+
+        num_unknown_words = 0
+        weights = []
+        for word in vocab:
+            if word in word_weights:
+                weights.append(word_weights[word])
+            elif word.lower() in word_weights:
+                weights.append(word_weights[word.lower()])
+            else:
+                weights.append(unknown_word_weight)
+                num_unknown_words += 1
+
+        logger.info(
+            "{} of {} words without a weighting value. Set weight to {}".format(
+                num_unknown_words, len(vocab), unknown_word_weight
+            )
+        )
+
+        # A one-column embedding table acts as the id -> weight look-up.
+        weight_column = torch.FloatTensor(weights).unsqueeze(1)
+        self.emb_layer = nn.Embedding(len(vocab), 1)
+        self.emb_layer.load_state_dict({"weight": weight_column})
+
+    def forward(self, features: Dict[str, Tensor]):
+        """Multiply the token embeddings by their weights and record the per-row weight sum."""
+        embeddings = features["token_embeddings"]
+        mask = features["attention_mask"].float()
+
+        # Per-token weight; positions with a zero mask get weight zero.
+        per_token_weight = self.emb_layer(features["input_ids"]).squeeze(-1) * mask
+        weight_sum = torch.sum(per_token_weight, 1)
+
+        # Broadcast the weight across the embedding axis before scaling.
+        scaled_embeddings = embeddings * per_token_weight.unsqueeze(-1).expand(embeddings.size())
+
+        features.update({"token_embeddings": scaled_embeddings, "token_weights_sum": weight_sum})
+        return features
+
+    def get_config_dict(self):
+        """Return the attributes named in ``config_keys`` as a dict."""
+        attributes = self.__dict__
+        return {name: attributes[name] for name in self.config_keys}
+
+    def save(self, output_path):
+        """Write the config dict to ``config.json`` inside ``output_path``."""
+        config_file = os.path.join(output_path, "config.json")
+        with open(config_file, "w") as fOut:
+            json.dump(self.get_config_dict(), fOut, indent=2)
+
+    @staticmethod
+    def load(input_path):
+        """Rebuild the module from ``config.json`` inside ``input_path``."""
+        config_file = os.path.join(input_path, "config.json")
+        with open(config_file) as fIn:
+            config = json.load(fIn)
+
+        return WordWeights(**config)
--- a/sentence_transformers/models/__init__.py
+++ b/sentence_transformers/models/__init__.py
+from .Transformer import Transformer
+from .Asym import Asym
+from .BoW import BoW
+from .CNN import CNN
+from .Dense import Dense
+from .Dropout import Dropout
+from .LayerNorm import LayerNorm
+from .LSTM import LSTM
+from .Normalize import Normalize
+from .Pooling import Pooling
+from .WeightedLayerPooling import WeightedLayerPooling
+from .WordEmbeddings import WordEmbeddings
+from .WordWeights import WordWeights
+from .CLIPModel import CLIPModel
+
+# Public names of this package; one entry per class imported above.
+__all__ = [
+    "Transformer", "Asym", "BoW", "CNN", "Dense", "Dropout", "LayerNorm",
+    "LSTM", "Normalize", "Pooling", "WeightedLayerPooling", "WordEmbeddings",
+    "WordWeights", "CLIPModel",
+]
--- a/sentence_transformers/models/tokenizer/PhraseTokenizer.py
+++ b/sentence_transformers/models/tokenizer/PhraseTokenizer.py
+from typing import List, Iterable
+import collections
+import string
+import os
+import json
+import logging
+from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS
+from transformers.utils.import_utils import is_nltk_available, NLTK_IMPORT_ERROR
+
+
+logger = logging.getLogger(__name__)
+
+
+class PhraseTokenizer(WordTokenizer):
+    """Tokenizes the text with respect to existent phrases in the vocab.
+
+    This tokenizer respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example,
+    in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... => [New_York, is, the])
+
+    Note: ``do_lower_case`` is stored and saved, but ``tokenize`` does not lower-case the input up front;
+    it only falls back to lower-cased look-ups per token.
+    """
+
+    def __init__(
+        self,
+        vocab: Iterable[str] = [],
+        stop_words: Iterable[str] = ENGLISH_STOP_WORDS,
+        do_lower_case: bool = False,
+        ngram_separator: str = "_",
+        max_ngram_length: int = 5,
+    ):
+        if not is_nltk_available():
+            raise ImportError(NLTK_IMPORT_ERROR.format(self.__class__.__name__))
+
+        self.stop_words = set(stop_words)
+        self.do_lower_case = do_lower_case
+        self.ngram_separator = ngram_separator
+        self.max_ngram_length = max_ngram_length
+        self.set_vocab(vocab)
+
+    def get_vocab(self):
+        """Return the vocabulary list built by the last ``set_vocab`` call."""
+        return self.vocab
+
+    def set_vocab(self, vocab: Iterable[str]):
+        """Install ``vocab`` and rebuild the word index and the phrase look-up tables."""
+        # Materialise once: the vocabulary is traversed twice and measured with len(),
+        # which a one-shot iterable such as a generator cannot support. The copy also
+        # keeps the shared default list of __init__ from being handed out by get_vocab().
+        vocab = list(vocab)
+        self.vocab = vocab
+        self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)])
+
+        # Check for ngram in vocab
+        self.ngram_lookup = set()
+        self.ngram_lengths = set()
+        for word in vocab:
+            if self.ngram_separator is not None and self.ngram_separator in word:
+                # Some words might be malformed in e.g. google news word2vec, containing two or more _ after each other
+                ngram_count = word.count(self.ngram_separator) + 1
+                if self.ngram_separator + self.ngram_separator not in word and ngram_count <= self.max_ngram_length:
+                    self.ngram_lookup.add(word)
+                    self.ngram_lengths.add(ngram_count)
+
+        if len(vocab) > 0:
+            logger.info("PhraseTokenizer - Phrase ngram lengths: {}".format(self.ngram_lengths))
+            logger.info("PhraseTokenizer - Num phrases: {}".format(len(self.ngram_lookup)))
+
+    def tokenize(self, text: str, **kwargs) -> List[int]:
+        """
+        Split ``text`` with NLTK, merge known phrases and map the tokens to vocabulary ids.
+
+        Stop words and tokens without a vocabulary entry are dropped.
+        """
+        from nltk import word_tokenize
+
+        tokens = word_tokenize(text, preserve_line=True)
+
+        # phrase detection, longest phrases first
+        for ngram_len in sorted(self.ngram_lengths, reverse=True):
+            idx = 0
+            while idx <= len(tokens) - ngram_len:
+                ngram = self.ngram_separator.join(tokens[idx : idx + ngram_len])
+                if ngram in self.ngram_lookup:
+                    tokens[idx : idx + ngram_len] = [ngram]
+                elif ngram.lower() in self.ngram_lookup:
+                    tokens[idx : idx + ngram_len] = [ngram.lower()]
+                idx += 1
+
+        # Map tokens to idx, filter stop words.
+        # Each token is tried as-is, then lower-cased, then with punctuation stripped.
+        tokens_filtered = []
+        for token in tokens:
+            if token in self.stop_words:
+                continue
+            elif token in self.word2idx:
+                tokens_filtered.append(self.word2idx[token])
+                continue
+
+            token = token.lower()
+            if token in self.stop_words:
+                continue
+            elif token in self.word2idx:
+                tokens_filtered.append(self.word2idx[token])
+                continue
+
+            token = token.strip(string.punctuation)
+            if token in self.stop_words:
+                continue
+            elif len(token) > 0 and token in self.word2idx:
+                tokens_filtered.append(self.word2idx[token])
+                continue
+
+        return tokens_filtered
+
+    def save(self, output_path: str):
+        """Write the tokenizer settings to ``phrasetokenizer_config.json`` in ``output_path``."""
+        with open(os.path.join(output_path, "phrasetokenizer_config.json"), "w") as fOut:
+            json.dump(
+                {
+                    "vocab": list(self.word2idx.keys()),
+                    "stop_words": list(self.stop_words),
+                    "do_lower_case": self.do_lower_case,
+                    "ngram_separator": self.ngram_separator,
+                    "max_ngram_length": self.max_ngram_length,
+                },
+                fOut,
+            )
+
+    @staticmethod
+    def load(input_path: str):
+        """Recreate a tokenizer from ``phrasetokenizer_config.json`` in ``input_path``."""
+        with open(os.path.join(input_path, "phrasetokenizer_config.json"), "r") as fIn:
+            config = json.load(fIn)
+
+        return PhraseTokenizer(**config)
--- a/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py
+++ b/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py
+from typing import List, Iterable
+import collections
+import string
+import os
+import json
+from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS
+
+
+class WhitespaceTokenizer(WordTokenizer):
+    """
+    Minimal tokenizer that splits on whitespace and maps the pieces to vocabulary ids.
+
+    Each piece is looked up as-is, then with surrounding punctuation stripped, then
+    lower-cased; stop words and pieces without a vocabulary entry are dropped.
+    """
+
+    def __init__(
+        self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False
+    ):
+        self.stop_words = set(stop_words)
+        self.do_lower_case = do_lower_case
+        self.set_vocab(vocab)
+
+    def get_vocab(self):
+        """Return the vocabulary object passed to the last ``set_vocab`` call."""
+        return self.vocab
+
+    def set_vocab(self, vocab: Iterable[str]):
+        """Store ``vocab`` and index every word by its position."""
+        self.vocab = vocab
+        self.word2idx = collections.OrderedDict((word, position) for position, word in enumerate(vocab))
+
+    def tokenize(self, text: str, **kwargs) -> List[int]:
+        """Return the vocabulary ids of the whitespace-separated pieces of ``text``."""
+        if self.do_lower_case:
+            text = text.lower()
+
+        ids = []
+        for piece in text.split():
+            stripped = piece.strip(string.punctuation)
+            lowered = stripped.lower()
+            # (candidate, may_be_looked_up): only the stripped form must be non-empty.
+            candidates = (
+                (piece, True),
+                (stripped, len(stripped) > 0),
+                (lowered, True),
+            )
+            for candidate, may_be_looked_up in candidates:
+                if candidate in self.stop_words:
+                    # A stop word in any form discards the whole piece.
+                    break
+                if may_be_looked_up and candidate in self.word2idx:
+                    ids.append(self.word2idx[candidate])
+                    break
+
+        return ids
+
+    def save(self, output_path: str):
+        """Write the tokenizer settings to ``whitespacetokenizer_config.json`` in ``output_path``."""
+        settings = {
+            "vocab": list(self.word2idx.keys()),
+            "stop_words": list(self.stop_words),
+            "do_lower_case": self.do_lower_case,
+        }
+        with open(os.path.join(output_path, "whitespacetokenizer_config.json"), "w") as fOut:
+            json.dump(settings, fOut)
+
+    @staticmethod
+    def load(input_path: str):
+        """Recreate a tokenizer from ``whitespacetokenizer_config.json`` in ``input_path``."""
+        config_file = os.path.join(input_path, "whitespacetokenizer_config.json")
+        with open(config_file, "r") as fIn:
+            config = json.load(fIn)
+
+        return WhitespaceTokenizer(**config)
--- a/sentence_transformers/models/tokenizer/WordTokenizer.py
+++ b/sentence_transformers/models/tokenizer/WordTokenizer.py
+from abc import ABC, abstractmethod
+from typing import List, Iterable
+
+# Default stop-word list: punctuation marks first, then English words in alphabetical order.
+ENGLISH_STOP_WORDS = [
+    # punctuation
+    "!", '"', "''", "``", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/",
+    ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~",
+    # a
+    "a", "about", "above", "across", "after", "afterwards", "again", "against", "ain",
+    "all", "almost", "alone", "along", "already", "also", "although", "always", "am",
+    "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow",
+    "anyone", "anything", "anyway", "anywhere", "are", "aren", "around", "as", "at",
+    # b
+    "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before",
+    "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond",
+    "bill", "both", "bottom", "but", "by",
+    # c
+    "call", "can", "cannot", "cant", "co", "con", "could", "couldn", "couldnt", "cry",
+    # d
+    "d", "de", "describe", "detail", "did", "didn", "do", "does", "doesn", "doing", "don",
+    "done", "down", "due", "during",
+    # e
+    "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough",
+    "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except",
+    # f
+    "few", "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for", "former",
+    "formerly", "forty", "found", "four", "from", "front", "full", "further",
+    # g
+    "get", "give", "go",
+    # h
+    "had", "hadn", "has", "hasn", "hasnt", "have", "haven", "having", "he", "hence", "her",
+    "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him",
+    "himself", "his", "how", "however", "hundred",
+    # i
+    "i", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "isn", "it", "its",
+    "itself",
+    # j, k
+    "just", "keep",
+    # l
+    "last", "latter", "latterly", "least", "less", "ll", "ltd",
+    # m
+    "m", "ma", "made", "many", "may", "me", "meanwhile", "might", "mightn", "mill", "mine",
+    "more", "moreover", "most", "mostly", "move", "much", "must", "mustn", "my", "myself",
+    # n
+    "name", "namely", "needn", "neither", "never", "nevertheless", "next", "nine", "no",
+    "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere",
+    # o
+    "o", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others",
+    "otherwise", "our", "ours", "ourselves", "out", "over", "own",
+    # p
+    "part", "per", "perhaps", "please", "put",
+    # r
+    "rather", "re",
+    # s
+    "s", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "shan",
+    "she", "should", "shouldn", "show", "side", "since", "sincere", "six", "sixty", "so",
+    "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere",
+    "still", "such", "system",
+    # t
+    "t", "take", "ten", "than", "that", "the", "their", "theirs", "them", "themselves",
+    "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
+    "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though",
+    "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top",
+    "toward", "towards", "twelve", "twenty", "two",
+    # u
+    "un", "under", "until", "up", "upon", "us",
+    # v
+    "ve", "very", "via",
+    # w
+    "was", "wasn", "we", "well", "were", "weren", "what", "whatever", "when", "whence",
+    "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon",
+    "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom",
+    "whose", "why", "will", "with", "within", "without", "won", "would", "wouldn",
+    # y
+    "y", "yet", "you", "your", "yours", "yourself", "yourselves",
+]
+
+
+class WordTokenizer(ABC):
+    """Abstract interface for tokenizers that map text to lists of vocabulary ids."""
+
+    @abstractmethod
+    def set_vocab(self, vocab: Iterable[str]):
+        """Install ``vocab`` as the tokenizer's vocabulary."""
+        pass
+
+    @abstractmethod
+    def get_vocab(self, vocab: Iterable[str] = None):
+        """
+        Return the tokenizer's vocabulary.
+
+        ``vocab`` is kept for backward compatibility only. It is optional so that this
+        abstract signature can be called without arguments, like ``get_vocab(self)``
+        overrides that take no argument.
+        """
+        pass
+
+    @abstractmethod
+    def tokenize(self, text: str, **kwargs) -> List[int]:
+        """Convert ``text`` into a list of vocabulary ids."""
+        pass
+
+    @abstractmethod
+    def save(self, output_path: str):
+        """Persist the tokenizer settings under ``output_path``."""
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def load(input_path: str):
+        """Recreate a tokenizer from the data stored under ``input_path``."""
+        pass
--- a/sentence_transformers/models/tokenizer/__init__.py
+++ b/sentence_transformers/models/tokenizer/__init__.py
+from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS
+from .WhitespaceTokenizer import WhitespaceTokenizer
+from .PhraseTokenizer import PhraseTokenizer
+
+# Public names of this package; one entry per name imported above.
+__all__ = [
+    "WordTokenizer",
+    "WhitespaceTokenizer",
+    "PhraseTokenizer",
+    "ENGLISH_STOP_WORDS",
+]
--- a/sentence_transformers/quantization.py
+++ b/sentence_transformers/quantization.py
+import time
+from torch import Tensor
+from typing import List, Literal, Tuple, TYPE_CHECKING
+import numpy as np
+import logging
+from typing import Dict, Optional, Union
+
+
+# Module-level logger, named after this module via __name__.
+logger = logging.getLogger(__name__)
+
+
+if TYPE_CHECKING:
+    import faiss
+    import usearch
+
+
+def semantic_search_faiss(
+    query_embeddings: np.ndarray,
+    corpus_embeddings: Optional[np.ndarray] = None,
+    corpus_index: Optional["faiss.Index"] = None,
+    corpus_precision: Literal["float32", "uint8", "ubinary"] = "float32",
+    top_k: int = 10,
+    ranges: Optional[np.ndarray] = None,
+    calibration_embeddings: Optional[np.ndarray] = None,
+    rescore: bool = True,
+    rescore_multiplier: int = 2,
+    exact: bool = True,
+    output_index: bool = False,
+) -> Tuple[List[List[Dict[str, Union[int, float]]]], float, "faiss.Index"]:
+    """
+    Performs semantic search using the FAISS library.
+
+    Rescoring will be performed if:
+    1. `rescore` is True
+    2. The query embeddings are not quantized
+    3. The corpus is quantized, i.e. the corpus precision is not float32
+    Only if these conditions are true, will we search for `top_k * rescore_multiplier` samples and then rescore to only
+    keep `top_k`.
+
+    :param query_embeddings: Embeddings of the query sentences. Ideally not quantized to allow for rescoring.
+    :param corpus_embeddings: Embeddings of the corpus sentences. Either `corpus_embeddings` or `corpus_index` should
+        be used, not both. The embeddings can be quantized to "int8" or "binary" for more efficient search.
+    :param corpus_index: FAISS index for the corpus sentences. Either `corpus_embeddings` or `corpus_index` should
+        be used, not both.
+    :param corpus_precision: Precision of the corpus embeddings. The options are "float32", "int8", or "binary".
+        Default is "float32".
+    :param top_k: Number of top results to retrieve. Default is 10.
+    :param ranges: Ranges for quantization of embeddings. This is only used for int8 quantization, where the ranges
+        refers to the minimum and maximum values for each dimension. So, it's a 2D array with shape (2, embedding_dim).
+        Default is None, which means that the ranges will be calculated from the calibration embeddings.
+    :param calibration_embeddings: Embeddings used for calibration during quantization. This is only used for int8
+        quantization, where the calibration embeddings can be used to compute ranges, i.e. the minimum and maximum
+        values for each dimension. Default is None, which means that the ranges will be calculated from the query
+        embeddings. This is not recommended.
+    :param rescore: Whether to perform rescoring. Note that rescoring still will only be used if the query embeddings
+        are not quantized and the corpus is quantized, i.e. the corpus precision is not "float32". Default is True.
+    :param rescore_multiplier: Oversampling factor for rescoring. The code will now search `top_k * rescore_multiplier` samples
+        and then rescore to only keep `top_k`. Default is 2.
+    :param exact: Whether to use exact search or approximate search. Default is True.
+    :param output_index: Whether to output the FAISS index used for the search. Default is False.
+
+    :return: A tuple containing a list of search results and the time taken for the search. If `output_index` is True,
+        the tuple will also contain the FAISS index used for the search.
+    :raises ValueError: If both `corpus_embeddings` and `corpus_index` are provided or if neither is provided.
+
+    The list of search results is in the format: [[{"corpus_id": int, "score": float}, ...], ...]
+    The time taken for the search is a float value.
+    """
+    import faiss
+
+    if corpus_embeddings is not None and corpus_index is not None:
+        raise ValueError("Only corpus_embeddings or corpus_index should be used, not both.")
+    if corpus_embeddings is None and corpus_index is None:
+        raise ValueError("Either corpus_embeddings or corpus_index should be used.")
+
+    # If corpus_index is not provided, create a new index
+    if corpus_index is None:
+        if corpus_precision in ("float32", "uint8"):
+            if exact:
+                corpus_index = faiss.IndexFlatIP(corpus_embeddings.shape[1])
+            else:
+                corpus_index = faiss.IndexHNSWFlat(corpus_embeddings.shape[1], 16)
+
+        elif corpus_precision == "ubinary":
+            if exact:
+                corpus_index = faiss.IndexBinaryFlat(corpus_embeddings.shape[1] * 8)
+            else:
+                corpus_index = faiss.IndexBinaryHNSW(corpus_embeddings.shape[1] * 8, 16)
+
+        corpus_index.add(corpus_embeddings)
+
+    # If rescoring is enabled and the query embeddings are in float32, we need to quantize them
+    # to the same precision as the corpus embeddings. Also update the top_k value to account for the
+    # rescore_multiplier
+    rescore_embeddings = None
+    k = top_k
+    if query_embeddings.dtype not in (np.uint8, np.int8):
+        if rescore:
+            if corpus_precision != "float32":
+                rescore_embeddings = query_embeddings
+                k *= rescore_multiplier
+            else:
+                logger.warning(
+                    "Rescoring is enabled but the corpus is not quantized. Either pass `rescore=False` or "
+                    'quantize the corpus embeddings with `quantize_embeddings(embeddings, precision="...") `'
+                    'and pass `corpus_precision="..."` to `semantic_search_faiss`.'
+                )
+
+        query_embeddings = quantize_embeddings(
+            query_embeddings,
+            precision=corpus_precision,
+            ranges=ranges,
+            calibration_embeddings=calibration_embeddings,
+        )
+    elif rescore:
+        logger.warning(
+            "Rescoring is enabled but the query embeddings are quantized. Either pass `rescore=False` or don't quantize the query embeddings."
+        )
+
+    # Perform the search using the FAISS index
+    start_t = time.time()
+    scores, indices = corpus_index.search(query_embeddings, k)
+
+    # If rescoring is enabled, we need to rescore the results using the rescore_embeddings
+    if rescore_embeddings is not None:
+        top_k_embeddings = np.array(
+            [[corpus_index.reconstruct(idx.item()) for idx in query_indices] for query_indices in indices]
+        )
+        # If the corpus precision is binary, we need to unpack the bits
+        if corpus_precision == "ubinary":
+            top_k_embeddings = np.unpackbits(top_k_embeddings, axis=-1).astype(int)
+        else:
+            top_k_embeddings = top_k_embeddings.astype(int)
+
+        # rescore_embeddings: [num_queries, embedding_dim]
+        # top_k_embeddings: [num_queries, top_k, embedding_dim]
+        # updated_scores: [num_queries, top_k]
+        # We use einsum to calculate the dot product between the query and the top_k embeddings, equivalent to looping
+        # over the queries and calculating 'rescore_embeddings[i] @ top_k_embeddings[i].T'
+        rescored_scores = np.einsum("ij,ikj->ik", rescore_embeddings, top_k_embeddings)
+        rescored_indices = np.argsort(-rescored_scores)[:, :top_k]
+        indices = indices[np.arange(len(query_embeddings))[:, None], rescored_indices]
+        scores = rescored_scores[np.arange(len(query_embeddings))[:, None], rescored_indices]
+
+    delta_t = time.time() - start_t
+
+    outputs = (
+        [
+            [
+                {"corpus_id": int(neighbor), "score": float(score)}
+                for score, neighbor in zip(scores[query_id], indices[query_id])
+            ]
+            for query_id in range(len(query_embeddings))
+        ],
+        delta_t,
+    )
+    if output_index:
+        outputs = (*outputs, corpus_index)
+    return outputs
+
+
+def semantic_search_usearch(
+    query_embeddings: np.ndarray,
+    corpus_embeddings: Optional[np.ndarray] = None,
+    corpus_index: Optional["usearch.index.Index"] = None,
+    corpus_precision: Literal["float32", "int8", "binary"] = "float32",
+    top_k: int = 10,
+    ranges: Optional[np.ndarray] = None,
+    calibration_embeddings: Optional[np.ndarray] = None,
+    rescore: bool = True,
+    rescore_multiplier: int = 2,
+    exact: bool = True,
+    output_index: bool = False,
+) -> Tuple[List[List[Dict[str, Union[int, float]]]], float, "usearch.index.Index"]:
+    """
+    Performs semantic search using the usearch library.
+
+    Rescoring will be performed if:
+    1. `rescore` is True
+    2. The query embeddings are not quantized
+    3. The corpus is quantized, i.e. the dtype of the usearch index is not float32
+    Only if these conditions are true, will we search for `top_k * rescore_multiplier` samples and then rescore to only
+    keep `top_k`.
+
+    Unquantized query embeddings are always quantized to `corpus_precision` before the search, regardless of
+    whether rescoring is enabled.
+
+    :param query_embeddings: Embeddings of the query sentences. Ideally not quantized to allow for rescoring.
+    :param corpus_embeddings: Embeddings of the corpus sentences. Either `corpus_embeddings` or `corpus_index` should
+        be used, not both. The embeddings can be quantized to "int8" or "binary" for more efficient search.
+    :param corpus_index: usearch index for the corpus sentences. Either `corpus_embeddings` or `corpus_index` should
+        be used, not both.
+    :param corpus_precision: Precision of the corpus embeddings. The options are "float32", "int8", or "binary".
+        Default is "float32".
+    :param top_k: Number of top results to retrieve. Default is 10.
+    :param ranges: Ranges for quantization of embeddings. This is only used for int8 quantization, where the ranges
+        refers to the minimum and maximum values for each dimension. So, it's a 2D array with shape (2, embedding_dim).
+        Default is None, which means that the ranges will be calculated from the calibration embeddings.
+    :param calibration_embeddings: Embeddings used for calibration during quantization. This is only used for int8
+        quantization, where the calibration embeddings can be used to compute ranges, i.e. the minimum and maximum
+        values for each dimension. Default is None, which means that the ranges will be calculated from the query
+        embeddings. This is not recommended.
+    :param rescore: Whether to perform rescoring. Note that rescoring still will only be used if the query embeddings
+        are not quantized and the corpus is quantized, i.e. the corpus precision is not "float32". Default is True.
+    :param rescore_multiplier: Oversampling factor for rescoring. The search retrieves `top_k * rescore_multiplier`
+        samples and then rescores them to only keep `top_k`. Default is 2.
+    :param exact: Whether to use exact search or approximate search. Default is True.
+    :param output_index: Whether to output the usearch index used for the search. Default is False.
+
+    :return: A tuple containing a list of search results and the time taken for the search. If `output_index` is True,
+        the tuple will also contain the usearch index used for the search.
+    :raises ValueError: If both `corpus_embeddings` and `corpus_index` are provided, if neither is provided, or if
+        `corpus_precision` is not one of "float32", "int8" or "binary".
+
+    The list of search results is in the format: [[{"corpus_id": int, "score": float}, ...], ...]
+    The time taken for the search is a float value (seconds); it covers the index search and, if performed, the
+    rescoring, but not the index construction or the query quantization.
+    """
+    # Imported lazily so that usearch is only required when this function is actually called
+    from usearch.index import Index
+    from usearch.compiled import ScalarKind
+
+    if corpus_embeddings is not None and corpus_index is not None:
+        raise ValueError("Only corpus_embeddings or corpus_index should be used, not both.")
+    if corpus_embeddings is None and corpus_index is None:
+        raise ValueError("Either corpus_embeddings or corpus_index should be used.")
+    if corpus_precision not in ["float32", "int8", "binary"]:
+        raise ValueError('corpus_precision must be "float32", "int8", or "binary" for usearch')
+
+    # If corpus_index is not provided, create a new index
+    if corpus_index is None:
+        if corpus_precision == "float32":
+            corpus_index = Index(
+                ndim=corpus_embeddings.shape[1],
+                metric="cos",
+                dtype="f32",
+            )
+        elif corpus_precision == "int8":
+            corpus_index = Index(
+                ndim=corpus_embeddings.shape[1],
+                metric="ip",
+                dtype="i8",
+            )
+        elif corpus_precision == "binary":
+            # NOTE(review): the bit-packed binary embeddings are stored as int8 values here, one value per
+            # packed byte -- confirm this is the intended usearch configuration for hamming search.
+            corpus_index = Index(
+                ndim=corpus_embeddings.shape[1],
+                metric="hamming",
+                dtype="i8",
+            )
+        # The position of each embedding in `corpus_embeddings` is used as its key, i.e. as the "corpus_id"
+        corpus_index.add(np.arange(len(corpus_embeddings)), corpus_embeddings)
+
+    # If the query embeddings are not quantized yet, quantize them to the same precision as the corpus
+    # embeddings. If rescoring applies, keep the unquantized queries and oversample the search by the
+    # rescore_multiplier
+    rescore_embeddings = None
+    k = top_k
+    if query_embeddings.dtype not in (np.uint8, np.int8):
+        if rescore:
+            # The dtype of the index (rather than `corpus_precision`) decides whether the corpus is quantized
+            if corpus_index.dtype != ScalarKind.F32:
+                rescore_embeddings = query_embeddings
+                k *= rescore_multiplier
+            else:
+                logger.warning(
+                    "Rescoring is enabled but the corpus is not quantized. Either pass `rescore=False` or "
+                    'quantize the corpus embeddings with `quantize_embeddings(embeddings, precision="...") `'
+                    'and pass `corpus_precision="..."` to `semantic_search_usearch`.'
+                )
+
+        query_embeddings = quantize_embeddings(
+            query_embeddings,
+            precision=corpus_precision,
+            ranges=ranges,
+            calibration_embeddings=calibration_embeddings,
+        )
+    elif rescore:
+        logger.warning(
+            "Rescoring is enabled but the query embeddings are quantized. Either pass `rescore=False` or don't quantize the query embeddings."
+        )
+
+    # Perform the search using the usearch index
+    start_t = time.time()
+    matches = corpus_index.search(query_embeddings, count=k, exact=exact)
+    scores = matches.distances
+    indices = matches.keys
+
+    # Make sure that both results are 2D, so that they can be indexed per query below
+    if scores.ndim < 2:
+        scores = np.atleast_2d(scores)
+    if indices.ndim < 2:
+        indices = np.atleast_2d(indices)
+
+    # If rescoring is enabled, we need to rescore the results using the rescore_embeddings
+    if rescore_embeddings is not None:
+        # Fetch the stored (quantized) corpus vectors of the retrieved candidates for every query
+        top_k_embeddings = np.array([corpus_index.get(query_indices) for query_indices in indices])
+        # If the corpus precision is binary, we need to unpack the bits
+        if corpus_precision == "binary":
+            top_k_embeddings = np.unpackbits(top_k_embeddings.astype(np.uint8), axis=-1)
+        top_k_embeddings = top_k_embeddings.astype(int)
+
+        # rescore_embeddings: [num_queries, embedding_dim]
+        # top_k_embeddings: [num_queries, k, embedding_dim], with k = top_k * rescore_multiplier
+        # rescored_scores: [num_queries, k]
+        # We use einsum to calculate the dot product between the query and the top_k embeddings, equivalent to looping
+        # over the queries and calculating 'rescore_embeddings[i] @ top_k_embeddings[i].T'
+        rescored_scores = np.einsum("ij,ikj->ik", rescore_embeddings, top_k_embeddings)
+        # Sort the candidates of each query by descending rescored score and only keep the best top_k
+        rescored_indices = np.argsort(-rescored_scores)[:, :top_k]
+        indices = indices[np.arange(len(query_embeddings))[:, None], rescored_indices]
+        scores = rescored_scores[np.arange(len(query_embeddings))[:, None], rescored_indices]
+
+    delta_t = time.time() - start_t
+
+    outputs = (
+        [
+            [
+                {"corpus_id": int(neighbor), "score": float(score)}
+                for score, neighbor in zip(scores[query_id], indices[query_id])
+            ]
+            for query_id in range(len(query_embeddings))
+        ],
+        delta_t,
+    )
+    if output_index:
+        outputs = (*outputs, corpus_index)
+    return outputs
+
+
+def quantize_embeddings(
+    embeddings: Union[Tensor, np.ndarray],
+    precision: Literal["float32", "int8", "uint8", "binary", "ubinary"],
+    ranges: Optional[np.ndarray] = None,
+    calibration_embeddings: Optional[np.ndarray] = None,
+) -> np.ndarray:
+    """
+    Quantizes embeddings to a lower precision. This can be used to reduce the memory footprint and increase the
+    speed of similarity search. The supported precisions are "float32", "int8", "uint8", "binary", and "ubinary".
+
+    For "int8" and "uint8", every dimension is mapped linearly from `[min, max]` onto 256 buckets. Values outside
+    of that range are clipped to the lowest/highest bucket, and a dimension whose minimum equals its maximum is
+    mapped to the lowest bucket.
+
+    :param embeddings: Unquantized (e.g. float) embeddings to quantize to a given precision. A torch Tensor,
+        a numpy array, or a list of Tensors / arrays.
+    :param precision: The precision to convert to. Options are "float32", "int8", "uint8", "binary", "ubinary".
+    :param ranges: Ranges for quantization of embeddings. This is only used for int8 quantization, where the ranges
+        refers to the minimum and maximum values for each dimension. So, it's a 2D array with shape (2, embedding_dim).
+        Default is None, which means that the ranges will be calculated from the calibration embeddings.
+    :type ranges: Optional[np.ndarray]
+    :param calibration_embeddings: Embeddings used for calibration during quantization. This is only used for int8
+        quantization, where the calibration embeddings can be used to compute ranges, i.e. the minimum and maximum
+        values for each dimension. Default is None, which means that the ranges will be calculated from the
+        embeddings that are being quantized. This is not recommended.
+    :type calibration_embeddings: Optional[np.ndarray]
+    :return: Quantized embeddings with the specified precision
+    :raises ValueError: If the embeddings are already int8/uint8, or if the precision is not supported.
+    """
+    if isinstance(embeddings, Tensor):
+        embeddings = embeddings.cpu().numpy()
+    elif isinstance(embeddings, list):
+        # Guard against an empty list before peeking at the first element
+        if embeddings and isinstance(embeddings[0], Tensor):
+            embeddings = [embedding.cpu().numpy() for embedding in embeddings]
+        embeddings = np.array(embeddings)
+    if embeddings.dtype in (np.uint8, np.int8):
+        raise ValueError("Embeddings to quantize must be float rather than int8 or uint8.")
+
+    if precision == "float32":
+        return embeddings.astype(np.float32)
+
+    if precision.endswith("int8"):
+        # Either use the 1. provided ranges, 2. the calibration dataset or 3. the provided embeddings
+        if ranges is None:
+            if calibration_embeddings is not None:
+                ranges = np.vstack((np.min(calibration_embeddings, axis=0), np.max(calibration_embeddings, axis=0)))
+            else:
+                if embeddings.shape[0] < 100:
+                    logger.warning(
+                        f"Computing {precision} quantization buckets based on {len(embeddings)} embedding{'s' if len(embeddings) != 1 else ''}."
+                        f" {precision} quantization is more stable with `ranges` calculated from more embeddings "
+                        "or a `calibration_embeddings` that can be used to calculate the buckets."
+                    )
+                ranges = np.vstack((np.min(embeddings, axis=0), np.max(embeddings, axis=0)))
+        starts = ranges[0, :]
+        steps = (ranges[1, :] - ranges[0, :]) / 255
+        # A dimension with min == max has a zero-width bucket; use a step of 1 to keep the division finite
+        steps = np.where(steps == 0, 1, steps)
+        # Saturate values outside of the calibrated range instead of letting the integer cast wrap around
+        buckets = np.clip((embeddings - starts) / steps, 0, 255)
+
+        if precision == "uint8":
+            return buckets.astype(np.uint8)
+        elif precision == "int8":
+            return (buckets - 128).astype(np.int8)
+
+    if precision == "binary":
+        # Pack 8 sign bits per byte and shift the byte range [0, 255] to the int8 range [-128, 127]
+        return (np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1) - 128).astype(np.int8)
+
+    if precision == "ubinary":
+        # Pack 8 sign bits per byte, kept as unsigned bytes
+        return np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1)
+
+    raise ValueError(f"Precision {precision} is not supported")
--- a/sentence_transformers/readers/InputExample.py
+++ b/sentence_transformers/readers/InputExample.py
+from typing import Union, List
+
+
+class InputExample:
+    """
+    Structure for one input example with texts, the label and a unique id
+    """
+
+    def __init__(self, guid: str = "", texts: List[str] = None, label: Union[int, float] = 0):
+        """
+        Creates one InputExample with the given texts, guid and label
+
+        :param guid: id for the example
+        :param texts: the texts for the example. Stored as given, so it stays `None` when omitted.
+        :param label: the label for the example
+        """
+        self.guid = guid
+        self.texts = texts
+        self.label = label
+
+    def __str__(self):
+        # `texts` defaults to None and may hold non-string items; neither case should break printing
+        texts = [] if self.texts is None else self.texts
+        return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(map(str, texts)))
--- a/sentence_transformers/readers/LabelSentenceReader.py
+++ b/sentence_transformers/readers/LabelSentenceReader.py
+from . import InputExample
+import os
+
+
+class LabelSentenceReader:
+    """Reads in a file that has at least two columns: a label and a sentence.
+    This reader can for example be used with the BatchHardTripletLoss.
+    Maps labels automatically to integers"""
+
+    def __init__(self, folder, label_col_idx=0, sentence_col_idx=1, separator="\t"):
+        self.folder = folder
+        # Maps every label string to an integer id; shared across all get_examples calls of this reader
+        self.label_map = {}
+        self.label_col_idx = label_col_idx
+        self.sentence_col_idx = sentence_col_idx
+        self.separator = separator
+
+    def get_examples(self, filename, max_examples=0):
+        """
+        Reads `filename` from the reader's folder and returns one InputExample per line.
+
+        :param filename: Name of the file inside `self.folder`
+        :param max_examples: Stop after this many examples. 0 (or less) reads the whole file.
+        :return: List of InputExample objects with a single text and an integer label
+        """
+        examples = []
+
+        # The context manager makes sure the file is closed, also when stopping early or on errors
+        with open(os.path.join(self.folder, filename), encoding="utf-8") as fIn:
+            for line_idx, line in enumerate(fIn):
+                splits = line.strip().split(self.separator)
+                label = splits[self.label_col_idx]
+                sentence = splits[self.sentence_col_idx]
+
+                # New labels get the next free integer id
+                if label not in self.label_map:
+                    self.label_map[label] = len(self.label_map)
+
+                label_id = self.label_map[label]
+                guid = "%s-%d" % (filename, line_idx)
+                examples.append(InputExample(guid=guid, texts=[sentence], label=label_id))
+
+                if 0 < max_examples <= len(examples):
+                    break
+
+        return examples
--- a/sentence_transformers/readers/NLIDataReader.py
+++ b/sentence_transformers/readers/NLIDataReader.py
+from . import InputExample
+import gzip
+import os
+
+
+class NLIDataReader(object):
+    """
+    Reads in the Stanford NLI dataset and the MultiGenre NLI dataset
+    """
+
+    def __init__(self, dataset_folder):
+        self.dataset_folder = dataset_folder
+
+    def get_examples(self, filename, max_examples=0):
+        """
+        filename specifies which data split to use, e.g. train.gz, dev.gz or test.gz.
+        Expects that self.dataset_folder contains the files s1.$filename, s2.$filename and
+        labels.$filename, e.g., for filename train.gz: s1.train.gz, s2.train.gz, labels.train.gz
+
+        :param filename: Suffix shared by the three gzip files of the split
+        :param max_examples: Stop after this many examples. 0 (or less) reads everything.
+        :return: List of InputExample objects with two texts and an integer label
+        """
+
+        def open_part(prefix):
+            return gzip.open(os.path.join(self.dataset_folder, prefix + filename), mode="rt", encoding="utf-8")
+
+        examples = []
+        # The context managers make sure all three files are closed, also when stopping early or on errors
+        with open_part("s1.") as s1, open_part("s2.") as s2, open_part("labels.") as labels:
+            for line_idx, (sentence_a, sentence_b, label) in enumerate(zip(s1, s2, labels)):
+                guid = "%s-%d" % (filename, line_idx)
+                examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label)))
+
+                if 0 < max_examples <= len(examples):
+                    break
+
+        return examples
+
+    @staticmethod
+    def get_labels():
+        """Returns the mapping from NLI label name to integer id."""
+        return {"contradiction": 0, "entailment": 1, "neutral": 2}
+
+    def get_num_labels(self):
+        """Returns the number of distinct NLI labels."""
+        return len(self.get_labels())
+
+    def map_label(self, label):
+        """Maps a label string (surrounding whitespace and casing are ignored) to its integer id."""
+        return self.get_labels()[label.strip().lower()]
--- a/sentence_transformers/readers/PairedFilesReader.py
+++ b/sentence_transformers/readers/PairedFilesReader.py
+from . import InputExample
+import gzip
+
+
+class PairedFilesReader(object):
+    """
+    Reads in a Pair Dataset that is split over several files: line i of every file belongs to example i
+    """
+
+    def __init__(self, filepaths):
+        self.filepaths = filepaths
+
+    def get_examples(self, max_examples=0):
+        """
+        Reads the files in parallel and returns one InputExample per line, with one text per file.
+        Reading stops as soon as one of the files is exhausted.
+
+        :param max_examples: Stop after this many examples. 0 (or less) reads everything.
+        :return: List of InputExample objects with label 1. Empty if no filepaths were given.
+        """
+        fIns = []
+        examples = []
+
+        try:
+            for filepath in self.filepaths:
+                fIn = (
+                    gzip.open(filepath, "rt", encoding="utf-8")
+                    if filepath.endswith(".gz")
+                    else open(filepath, encoding="utf-8")
+                )
+                fIns.append(fIn)
+
+            # Without any file there is no end-of-file to stop at, so do not enter the loop at all
+            while fIns:
+                texts = [fIn.readline() for fIn in fIns]
+
+                # readline returns "" only at the end of a file (blank lines are "\n")
+                if "" in texts:
+                    break
+
+                examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1))
+                if max_examples > 0 and len(examples) >= max_examples:
+                    break
+        finally:
+            # Close every file that was opened, also when stopping early or on errors
+            for fIn in fIns:
+                fIn.close()
+
+        return examples
--- a/sentence_transformers/readers/STSDataReader.py
+++ b/sentence_transformers/readers/STSDataReader.py
+from . import InputExample
+import csv
+import gzip
+import os
+
+
+class STSDataReader:
+    """
+    Reader for STS-style files: every row holds a sentence pair (columns s1_col_idx and s2_col_idx) and a
+    similarity score (column score_col_idx).
+
+    By default a tab separated file is expected, with the sentence pair in the first two columns and the score
+    in the third one. By default the scores are normalized from the range 0...5 to 0...1.
+    """
+
+    def __init__(
+        self,
+        dataset_folder,
+        s1_col_idx=0,
+        s2_col_idx=1,
+        score_col_idx=2,
+        delimiter="\t",
+        quoting=csv.QUOTE_NONE,
+        normalize_scores=True,
+        min_score=0,
+        max_score=5,
+    ):
+        # Where the files are located
+        self.dataset_folder = dataset_folder
+        # Column layout
+        self.s1_col_idx = s1_col_idx
+        self.s2_col_idx = s2_col_idx
+        self.score_col_idx = score_col_idx
+        # CSV dialect
+        self.delimiter = delimiter
+        self.quoting = quoting
+        # Score normalization
+        self.normalize_scores = normalize_scores
+        self.min_score = min_score
+        self.max_score = max_score
+
+    def get_examples(self, filename, max_examples=0):
+        """
+        Reads the given file (e.g. train.csv, dev.csv, test.csv) from the dataset folder. Files ending
+        with .gz are read as gzip compressed text.
+
+        :param filename: Name of the file inside the dataset folder
+        :param max_examples: Stop after this many examples. 0 (or less) reads the whole file.
+        :return: List of InputExample objects with two texts and a float label
+        """
+        filepath = os.path.join(self.dataset_folder, filename)
+        if filename.endswith(".gz"):
+            handle = gzip.open(filepath, "rt", encoding="utf8")
+        else:
+            handle = open(filepath, encoding="utf-8")
+
+        with handle as fIn:
+            examples = []
+            rows = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting)
+            for row_idx, row in enumerate(rows):
+                score = float(row[self.score_col_idx])
+                if self.normalize_scores:
+                    # Map [min_score, max_score] linearly onto [0, 1]
+                    score = (score - self.min_score) / (self.max_score - self.min_score)
+
+                sentence_pair = [row[self.s1_col_idx], row[self.s2_col_idx]]
+                examples.append(InputExample(guid=filename + str(row_idx), texts=sentence_pair, label=score))
+
+                if 0 < max_examples <= len(examples):
+                    break
+
+        return examples
+
+
+class STSBenchmarkDataReader(STSDataReader):
+    """
+    STSDataReader preconfigured for the STS benchmark dataset: the default column indices are 5 and 6 for the
+    sentence pair and 4 for the score. By default the scores are normalized from 0...5 to 0...1.
+    """
+
+    def __init__(
+        self,
+        dataset_folder,
+        s1_col_idx=5,
+        s2_col_idx=6,
+        score_col_idx=4,
+        delimiter="\t",
+        quoting=csv.QUOTE_NONE,
+        normalize_scores=True,
+        min_score=0,
+        max_score=5,
+    ):
+        # Only the defaults differ from the parent class; everything is handed over unchanged
+        super().__init__(
+            dataset_folder,
+            s1_col_idx=s1_col_idx,
+            s2_col_idx=s2_col_idx,
+            score_col_idx=score_col_idx,
+            delimiter=delimiter,
+            quoting=quoting,
+            normalize_scores=normalize_scores,
+            min_score=min_score,
+            max_score=max_score,
+        )
--- a/sentence_transformers/readers/TripletReader.py
+++ b/sentence_transformers/readers/TripletReader.py
+from . import InputExample
+import csv
+import os
+
+
+class TripletReader(object):
+    """
+    Reads in a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1),
+    one positive example (s2) and one negative example (s3)
+    """
+
+    def __init__(
+        self,
+        dataset_folder,
+        s1_col_idx=0,
+        s2_col_idx=1,
+        s3_col_idx=2,
+        has_header=False,
+        delimiter="\t",
+        quoting=csv.QUOTE_NONE,
+    ):
+        self.dataset_folder = dataset_folder
+        self.s1_col_idx = s1_col_idx
+        self.s2_col_idx = s2_col_idx
+        self.s3_col_idx = s3_col_idx
+        self.has_header = has_header
+        self.delimiter = delimiter
+        self.quoting = quoting
+
+    def get_examples(self, filename, max_examples=0):
+        """
+        Reads `filename` from the dataset folder and returns one InputExample per row.
+
+        :param filename: Name of the file inside the dataset folder
+        :param max_examples: Stop after this many examples. 0 (or less) reads the whole file.
+        :return: List of InputExample objects with the texts [anchor, positive, negative]
+        """
+        examples = []
+        # The context manager makes sure the file is closed, also when stopping early or on errors
+        with open(os.path.join(self.dataset_folder, filename), encoding="utf-8") as fIn:
+            data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting)
+            if self.has_header:
+                # Skip the header row
+                next(data)
+
+            for row in data:
+                s1 = row[self.s1_col_idx]
+                s2 = row[self.s2_col_idx]
+                s3 = row[self.s3_col_idx]
+
+                examples.append(InputExample(texts=[s1, s2, s3]))
+                if max_examples > 0 and len(examples) >= max_examples:
+                    break
+
+        return examples
--- a/sentence_transformers/readers/__init__.py
+++ b/sentence_transformers/readers/__init__.py
+from .InputExample import InputExample
+from .LabelSentenceReader import LabelSentenceReader
+from .NLIDataReader import NLIDataReader
+from .STSDataReader import STSDataReader, STSBenchmarkDataReader
+from .TripletReader import TripletReader
+
+__all__ = [
+    "InputExample",
+    "LabelSentenceReader",
+    "NLIDataReader",
+    "STSDataReader",
+    "STSBenchmarkDataReader",
+    "TripletReader",
+]
--- a/sentence_transformers/util.py
+++ b/sentence_transformers/util.py
+import functools
+import requests
+from torch import Tensor, device
+from typing import List, Callable, Literal
+from tqdm.autonotebook import tqdm
+import sys
+import importlib
+import os
+import torch
+import numpy as np
+import queue
+import logging
+from typing import Dict, Optional, Union, overload
+
+from transformers import is_torch_npu_available
+from huggingface_hub import snapshot_download, hf_hub_download
+import heapq
+
+logger = logging.getLogger(__name__)
+
+
+def pytorch_cos_sim(a: Tensor, b: Tensor) -> Tensor:
+    """
+    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
+
+    This function only forwards its arguments to `cos_sim`.
+
+    :param a: First set of embeddings
+    :param b: Second set of embeddings
+    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
+    """
+    return cos_sim(a, b)
+
+
+def cos_sim(a: Tensor, b: Tensor) -> Tensor:
+    """
+    Computes the cosine similarity between every row of a and every row of b.
+
+    Inputs that are not tensors are converted with torch.tensor, and 1D inputs are treated as a single row.
+
+    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
+    """
+    a = a if isinstance(a, torch.Tensor) else torch.tensor(a)
+    b = b if isinstance(b, torch.Tensor) else torch.tensor(b)
+
+    # A single vector becomes a matrix with one row
+    if a.dim() == 1:
+        a = a.unsqueeze(0)
+    if b.dim() == 1:
+        b = b.unsqueeze(0)
+
+    # Rows of unit length turn the matrix product into the cosine similarity
+    unit_a = torch.nn.functional.normalize(a, p=2, dim=1)
+    unit_b = torch.nn.functional.normalize(b, p=2, dim=1)
+    return torch.mm(unit_a, unit_b.transpose(0, 1))
+
+
+def dot_score(a: Tensor, b: Tensor) -> Tensor:
+    """
+    Computes the dot-product between every row of a and every row of b.
+
+    Inputs that are not tensors are converted with torch.tensor, and 1D inputs are treated as a single row.
+
+    :return: Matrix with res[i][j]  = dot_prod(a[i], b[j])
+    """
+    a = a if isinstance(a, torch.Tensor) else torch.tensor(a)
+    b = b if isinstance(b, torch.Tensor) else torch.tensor(b)
+
+    # A single vector becomes a matrix with one row
+    if a.dim() == 1:
+        a = a.unsqueeze(0)
+    if b.dim() == 1:
+        b = b.unsqueeze(0)
+
+    b_transposed = b.transpose(0, 1)
+    return torch.mm(a, b_transposed)
+
+
+def pairwise_dot_score(a: Tensor, b: Tensor) -> Tensor:
+    """
+    Computes the dot-product of matching rows, i.e. dot_prod(a[i], b[i]).
+
+    Inputs that are not tensors are converted with torch.tensor.
+
+    :return: Vector with res[i] = dot_prod(a[i], b[i])
+    """
+    a = a if isinstance(a, torch.Tensor) else torch.tensor(a)
+    b = b if isinstance(b, torch.Tensor) else torch.tensor(b)
+
+    # Elementwise product, summed over the last (embedding) dimension
+    products = a * b
+    return products.sum(dim=-1)
+
+
+def pairwise_cos_sim(a: Tensor, b: Tensor) -> Tensor:
+    """
+    Computes the cosine similarity of matching rows, i.e. cos_sim(a[i], b[i]).
+
+    Inputs that are not tensors are converted with torch.tensor.
+
+    :return: Vector with res[i] = cos_sim(a[i], b[i])
+    """
+    a = a if isinstance(a, torch.Tensor) else torch.tensor(a)
+    b = b if isinstance(b, torch.Tensor) else torch.tensor(b)
+
+    # The dot-product of unit length embeddings is their cosine similarity
+    unit_a = normalize_embeddings(a)
+    unit_b = normalize_embeddings(b)
+    return pairwise_dot_score(unit_a, unit_b)
+
+
+def pairwise_angle_sim(x: Tensor, y: Tensor) -> Tensor:
+    """
+    Computes the absolute normalized angle distance;
+    see AnglELoss or https://arxiv.org/abs/2309.12871v1
+    for more information.
+
+    Both inputs are split into two halves along dim 1; the halves are used as the two components
+    of a complex representation. Inputs that are not tensors are converted with torch.tensor.
+
+    :param x: First set of embeddings; must have (at least) two dimensions, as it is chunked along dim 1
+    :param y: Second set of embeddings, chunked in the same way as x
+    :return: Vector with res[i] = angle_sim(x[i], y[i])
+    """
+
+    if not isinstance(x, torch.Tensor):
+        x = torch.tensor(x)
+
+    if not isinstance(y, torch.Tensor):
+        y = torch.tensor(y)
+
+    # modified from https://github.com/SeanLee97/AnglE/blob/main/angle_emb/angle.py
+    # chunk both tensors to obtain complex components
+    a, b = torch.chunk(x, 2, dim=1)
+    c, d = torch.chunk(y, 2, dim=1)
+
+    # Numerators of the complex division (a + bi) / (c + di); the denominator z is the sum of
+    # c**2 + d**2 over the whole row (kept as a column, so that it broadcasts), not an elementwise value
+    z = torch.sum(c**2 + d**2, dim=1, keepdim=True)
+    re = (a * c + b * d) / z
+    im = (b * c - a * d) / z
+
+    # dz and dw are the L2 norms of the rows of x and y; re and im are divided in-place by their ratio
+    dz = torch.sum(a**2 + b**2, dim=1, keepdim=True) ** 0.5
+    dw = torch.sum(c**2 + d**2, dim=1, keepdim=True) ** 0.5
+    re /= dz / dw
+    im /= dz / dw
+
+    # Sum all real and imaginary components per row, and return the absolute value of that sum
+    norm_angle = torch.sum(torch.concat((re, im), dim=1), dim=1)
+    return torch.abs(norm_angle)
+
+
+def normalize_embeddings(embeddings: Tensor) -> Tensor:
+    """
+    Normalizes the embeddings matrix, so that each sentence embedding has unit length
+
+    :param embeddings: Embeddings to normalize. The L2 normalization is applied along dim 1, so the
+        embeddings need (at least) two dimensions.
+    :return: The normalized embeddings
+    """
+    return torch.nn.functional.normalize(embeddings, p=2, dim=1)
+
+
+# Typing overloads: the return type matches the type of the given embeddings
+@overload
+def truncate_embeddings(embeddings: np.ndarray, truncate_dim: Optional[int]) -> np.ndarray: ...
+
+
+@overload
+def truncate_embeddings(embeddings: torch.Tensor, truncate_dim: Optional[int]) -> torch.Tensor: ...
+
+
+def truncate_embeddings(
+    embeddings: Union[np.ndarray, torch.Tensor], truncate_dim: Optional[int]
+) -> Union[np.ndarray, torch.Tensor]:
+    """
+    Truncates the embeddings by slicing the last dimension to its first `truncate_dim` entries.
+
+    :param embeddings: Embeddings to truncate.
+    :param truncate_dim: The dimension to truncate sentence embeddings to. `None` does no truncation.
+    :return: Truncated embeddings.
+    """
+    # The ellipsis keeps all leading dimensions; a slice up to None keeps the full last dimension
+    return embeddings[..., :truncate_dim]
+
+
+def paraphrase_mining(
+    model, sentences: List[str], show_progress_bar: bool = False, batch_size: int = 32, *args, **kwargs
+) -> List[List[Union[float, int]]]:
+    """
+    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
+    other sentences and returns a list with the pairs that have the highest cosine similarity score.
+
+    The sentences are encoded with the model, after which the embeddings and all additional positional and
+    keyword arguments are passed on to `paraphrase_mining_embeddings`. The parameters `query_chunk_size`,
+    `corpus_chunk_size`, `max_pairs`, `top_k` and `score_function` below are accepted via these additional
+    arguments.
+
+    :param model: SentenceTransformer model for embedding computation
+    :param sentences: A list of strings (texts or sentences)
+    :param show_progress_bar: Plotting of a progress bar
+    :param batch_size: Number of texts that are encoded simultaneously by the model
+    :param query_chunk_size: Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
+    :param corpus_chunk_size: Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
+    :param max_pairs: Maximal number of text pairs returned.
+    :param top_k: For each sentence, we retrieve up to top_k other sentences
+    :param score_function: Function for computing scores. By default, cosine similarity.
+    :return: Returns a list of triplets with the format [score, id1, id2]
+    """
+
+    # Compute embedding for the sentences
+    embeddings = model.encode(
+        sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_tensor=True
+    )
+
+    # The mining itself is done on the embeddings
+    return paraphrase_mining_embeddings(embeddings, *args, **kwargs)
+
+
+def paraphrase_mining_embeddings(
+    embeddings: Tensor,
+    query_chunk_size: int = 5000,
+    corpus_chunk_size: int = 100000,
+    max_pairs: int = 500000,
+    top_k: int = 100,
+    score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim,
+) -> List[List[Union[float, int]]]:
+    """
+    Given a tensor of embeddings, this function performs paraphrase mining. It compares all embeddings against all
+    other embeddings and returns a list with the pairs that have the highest score.
+
+    Candidates are collected per (query chunk, corpus chunk) block. The mirrored candidates (i, j) and (j, i) are
+    queued separately and only merged at the end, so fewer than max_pairs distinct pairs may be returned.
+
+    :param embeddings: A tensor with the embeddings
+    :param query_chunk_size: Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
+    :param corpus_chunk_size: Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
+    :param max_pairs: Maximal number of text pairs returned.
+    :param top_k: For each sentence, we retrieve up to top_k other sentences
+    :param score_function: Function for computing scores. By default, cosine similarity.
+    :return: Returns a list of triplets with the format [score, id1, id2], highest score first, with id1 < id2
+    """
+
+    top_k += 1  # A sentence has the highest similarity to itself. Increase +1 as we are interested in distinct pairs
+
+    # Min-priority queue: the lowest-scoring candidate is at the front and is the first one to be evicted
+    pairs = queue.PriorityQueue()
+    # Candidates must score strictly above this bound; it rises whenever a candidate is evicted
+    min_score = -1
+
+    for corpus_start_idx in range(0, len(embeddings), corpus_chunk_size):
+        for query_start_idx in range(0, len(embeddings), query_chunk_size):
+            scores = score_function(
+                embeddings[query_start_idx : query_start_idx + query_chunk_size],
+                embeddings[corpus_start_idx : corpus_start_idx + corpus_chunk_size],
+            )
+
+            scores_top_k_values, scores_top_k_idx = torch.topk(
+                scores, min(top_k, len(scores[0])), dim=1, largest=True, sorted=False
+            )
+            scores_top_k_values = scores_top_k_values.cpu().tolist()
+            scores_top_k_idx = scores_top_k_idx.cpu().tolist()
+
+            for query_itr in range(len(scores)):
+                for top_k_idx, corpus_itr in enumerate(scores_top_k_idx[query_itr]):
+                    # Translate chunk-local positions back into global embedding indices
+                    i = query_start_idx + query_itr
+                    j = corpus_start_idx + corpus_itr
+                    score = scores_top_k_values[query_itr][top_k_idx]
+
+                    if i != j and score > min_score:
+                        pairs.put((score, i, j))
+
+                        # Evict only once the budget is exceeded, so that up to max_pairs candidates
+                        # survive (evicting on ">=" kept at most max_pairs - 1 of them)
+                        if pairs.qsize() > max_pairs:
+                            min_score = pairs.get()[0]
+
+    # Get the pairs
+    added_pairs = set()  # Used for duplicate detection
+    pairs_list = []
+    while not pairs.empty():
+        score, i, j = pairs.get()
+        sorted_i, sorted_j = sorted([i, j])
+
+        if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
+            added_pairs.add((sorted_i, sorted_j))
+            pairs_list.append([score, sorted_i, sorted_j])
+
+    # Highest scores first
+    pairs_list = sorted(pairs_list, key=lambda x: x[0], reverse=True)
+    return pairs_list
+
+
+def information_retrieval(*args, **kwargs) -> List[List[Dict[str, Union[int, float]]]]:
+    """
+    This function is deprecated. Use semantic_search instead.
+
+    All positional and keyword arguments are forwarded unchanged to semantic_search
+    and its result is returned as-is.
+    """
+    return semantic_search(*args, **kwargs)
+
+
+def semantic_search(
+    query_embeddings: Tensor,
+    corpus_embeddings: Tensor,
+    query_chunk_size: int = 100,
+    corpus_chunk_size: int = 500000,
+    top_k: int = 10,
+    score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim,
+) -> List[List[Dict[str, Union[int, float]]]]:
+    """
+    Scores every query embedding against every corpus embedding and keeps the best top_k corpus entries per query.
+    Queries and corpus are processed chunk by chunk, so only one (query chunk x corpus chunk) score matrix exists
+    at a time.
+
+    :param query_embeddings: A 2 dimensional tensor with the query embeddings. A numpy array or a list of tensors is converted; a 1 dimensional input is treated as a single query.
+    :param corpus_embeddings: A 2 dimensional tensor with the corpus embeddings. A numpy array or a list of tensors is converted.
+    :param query_chunk_size: Number of queries scored simultaneously. Increasing that value increases the speed, but requires more memory.
+    :param corpus_chunk_size: Number of corpus entries scanned at a time. Increasing that value increases the speed, but requires more memory.
+    :param top_k: Retrieve top k matching entries.
+    :param score_function: Function for computing scores. By default, cosine similarity.
+    :return: Returns a list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing score.
+    """
+
+    def _as_tensor(values):
+        # numpy input is wrapped, a list is stacked, anything else is passed through untouched
+        if isinstance(values, (np.ndarray, np.generic)):
+            return torch.from_numpy(values)
+        if isinstance(values, list):
+            return torch.stack(values)
+        return values
+
+    query_embeddings = _as_tensor(query_embeddings)
+    if len(query_embeddings.shape) == 1:
+        # A single query vector becomes a batch of one
+        query_embeddings = query_embeddings.unsqueeze(0)
+
+    corpus_embeddings = _as_tensor(corpus_embeddings)
+
+    # The queries follow the corpus onto its device
+    if corpus_embeddings.device != query_embeddings.device:
+        query_embeddings = query_embeddings.to(corpus_embeddings.device)
+
+    # One min-heap of (score, corpus_id) per query; heapq orders by the first tuple element
+    heaps = [[] for _ in range(len(query_embeddings))]
+
+    for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
+        query_chunk = query_embeddings[query_start_idx : query_start_idx + query_chunk_size]
+
+        for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
+            chunk_scores = score_function(
+                query_chunk,
+                corpus_embeddings[corpus_start_idx : corpus_start_idx + corpus_chunk_size],
+            )
+
+            # Best candidates of this corpus chunk for every query of the chunk
+            best_values, best_indices = torch.topk(
+                chunk_scores, min(top_k, len(chunk_scores[0])), dim=1, largest=True, sorted=False
+            )
+            best_values = best_values.cpu().tolist()
+            best_indices = best_indices.cpu().tolist()
+
+            for query_itr, (row_indices, row_values) in enumerate(zip(best_indices, best_values)):
+                heap = heaps[query_start_idx + query_itr]
+                for sub_corpus_id, score in zip(row_indices, row_values):
+                    candidate = (score, corpus_start_idx + sub_corpus_id)
+                    if len(heap) < top_k:
+                        heapq.heappush(heap, candidate)
+                    else:
+                        # Heap is full: insert the candidate and drop the weakest entry in one step
+                        heapq.heappushpop(heap, candidate)
+
+    # Change the data format and sort every result list, best score first
+    return [
+        sorted(
+            [{"corpus_id": corpus_id, "score": score} for score, corpus_id in heap],
+            key=lambda hit: hit["score"],
+            reverse=True,
+        )
+        for heap in heaps
+    ]
+
+
+def http_get(url, path) -> None:
+    """
+    Downloads a URL to a given path on disc
+
+    The body is streamed into `path + "_part"` and only renamed to `path` once the download
+    loop has finished, so `path` itself is never left half-written.
+
+    :param url: URL to download
+    :param path: Target file path; missing parent directories are created
+    :raises requests.HTTPError: Raised by `raise_for_status` for an error status. Any other non-200 status is reported on stderr and the function returns without downloading.
+    """
+    if os.path.dirname(path) != "":
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+
+    req = requests.get(url, stream=True)
+    try:
+        if req.status_code != 200:
+            print("Exception when trying to download {}. Response {}".format(url, req.status_code), file=sys.stderr)
+            req.raise_for_status()
+            return
+
+        download_filepath = path + "_part"
+        content_length = req.headers.get("Content-Length")
+        total = int(content_length) if content_length is not None else None
+
+        # Both the file and the progress bar are closed even if the transfer fails half-way
+        with open(download_filepath, "wb") as file_binary:
+            with tqdm(unit="B", total=total, unit_scale=True) as progress:
+                for chunk in req.iter_content(chunk_size=1024):
+                    if chunk:  # filter out keep-alive new chunks
+                        progress.update(len(chunk))
+                        file_binary.write(chunk)
+
+        os.rename(download_filepath, path)
+    finally:
+        # A streamed response keeps its connection open until it is closed explicitly
+        req.close()
+
+
+def batch_to_device(batch, target_device: device):
+    """
+    Move every tensor value of a batch to the given device (CPU/GPU).
+
+    The batch is updated in place and also returned; values that are not tensors are left untouched.
+    """
+    for key in batch:
+        value = batch[key]
+        if not isinstance(value, Tensor):
+            continue
+        batch[key] = value.to(target_device)
+    return batch
+
+
+def fullname(o) -> str:
+    """
+    Build the qualified name (module.class_name) of the class of an object. Will
+    be used to load the correct classes from JSON files.
+
+    Classes living in the builtins module are reported by their bare class name.
+    """
+    cls = o.__class__
+    module_name = cls.__module__
+
+    # `str.__class__` is `type`, so this is the name of the module hosting the builtins
+    builtins_module = str.__class__.__module__
+    if module_name is not None and module_name != builtins_module:
+        return module_name + "." + cls.__name__
+    return cls.__name__  # Avoid reporting __builtin__
+
+
+def import_from_string(dotted_path):
+    """
+    Resolve a dotted path to the attribute/class named by its last component.
+
+    The full path is first tried as a module of its own; if that import fails for any
+    reason, the path without its last component is imported instead. The last component
+    is then looked up on whichever module was imported.
+
+    Raises ImportError if the path contains no dot or the module lacks the attribute.
+    """
+    try:
+        parent_path, attr_name = dotted_path.rsplit(".", 1)
+    except ValueError:
+        raise ImportError("%s doesn't look like a module path" % dotted_path)
+
+    # NOTE(review): presumably this covers modules named after the class they define — confirm
+    try:
+        target_module = importlib.import_module(dotted_path)
+    except Exception:
+        target_module = importlib.import_module(parent_path)
+
+    try:
+        return getattr(target_module, attr_name)
+    except AttributeError:
+        raise ImportError('Module "%s" does not define a "%s" attribute/class' % (parent_path, attr_name))
+
+
+def community_detection(
+    embeddings, threshold=0.75, min_community_size=10, batch_size=1024, show_progress_bar=False
+) -> List[List[int]]:
+    """
+    Function for Fast Community Detection
+    Finds in the embeddings all communities, i.e. groups of embeddings whose score against a
+    central embedding is at least threshold. Only communities with at least min_community_size
+    members are returned, largest first. The first element in each list is the central point
+    in the community.
+
+    :param embeddings: The embeddings to cluster; anything that is not a tensor is converted with torch.tensor
+    :param threshold: Minimum score for two embeddings to count as close
+    :param min_community_size: Minimum number of members of a community (capped at the number of embeddings)
+    :param batch_size: Number of embeddings scored against all embeddings at a time
+    :param show_progress_bar: Show a progress bar while scoring
+    :return: A list of communities, each one a list of embedding indices
+    """
+    if not isinstance(embeddings, torch.Tensor):
+        embeddings = torch.tensor(embeddings)
+
+    threshold = torch.tensor(threshold, device=embeddings.device)
+    embeddings = normalize_embeddings(embeddings)
+
+    num_embeddings = len(embeddings)
+    candidates = []
+
+    # Maximum size for community
+    min_community_size = min(min_community_size, num_embeddings)
+    sort_max_size = min(max(2 * min_community_size, 50), num_embeddings)
+
+    # Use a torch-heavy approach if the embeddings are on CUDA/NPU, otherwise a loop-heavy one
+    on_accelerator = embeddings.device.type in ("cuda", "npu")
+
+    for start_idx in tqdm(
+        range(0, num_embeddings, batch_size), desc="Finding clusters", disable=not show_progress_bar
+    ):
+        # Scores of this batch against all embeddings
+        cos_scores = embeddings[start_idx : start_idx + batch_size] @ embeddings.T
+
+        if on_accelerator:
+            # Count, per row, how many embeddings reach the threshold
+            row_wise_count = (cos_scores >= threshold).sum(1)
+
+            # Only rows with enough close embeddings can seed a community
+            large_enough_mask = row_wise_count >= min_community_size
+            if not large_enough_mask.any():
+                continue
+
+            row_wise_count = row_wise_count[large_enough_mask]
+            cos_scores = cos_scores[large_enough_mask]
+
+            # The max is the largest potential community, so we use that in topk
+            largest_count = row_wise_count.max()
+            _, top_k_indices = cos_scores.topk(k=largest_count, largest=True)
+
+            # Each row keeps only as many indices as it has close embeddings
+            for count, indices in zip(row_wise_count, top_k_indices):
+                candidates.append(indices[:count].tolist())
+        else:
+            # Minimum size for a community
+            top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)
+
+            for row, row_top_values in enumerate(top_k_values):
+                # The weakest of the top entries must still reach the threshold
+                if not row_top_values[-1] >= threshold:
+                    continue
+
+                # Only check top k most similar entries
+                top_val_large, top_idx_large = cos_scores[row].topk(k=sort_max_size, largest=True)
+
+                # Widen the window (for this and all later rows) while its last entry is still above the threshold
+                while top_val_large[-1] > threshold and sort_max_size < num_embeddings:
+                    sort_max_size = min(2 * sort_max_size, num_embeddings)
+                    top_val_large, top_idx_large = cos_scores[row].topk(k=sort_max_size, largest=True)
+
+                candidates.append(top_idx_large[top_val_large >= threshold].tolist())
+
+    # Largest cluster first
+    candidates = sorted(candidates, key=len, reverse=True)
+
+    # Step 2) Remove overlapping communities
+    unique_communities = []
+    extracted_ids = set()
+
+    for community in candidates:
+        remaining = [idx for idx in community if idx not in extracted_ids]
+
+        if len(remaining) >= min_community_size:
+            unique_communities.append(remaining)
+            extracted_ids.update(remaining)
+
+    return sorted(unique_communities, key=len, reverse=True)
+
+
+##################
+#
+######################
+
+
+class disabled_tqdm(tqdm):
+    """
+    Class to override `disable` argument in case progress bars are globally disabled.
+
+    Taken from https://github.com/tqdm/tqdm/issues/619#issuecomment-619639324.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # Force `disable=True`, overriding whatever the caller passed for it
+        kwargs["disable"] = True
+        super().__init__(*args, **kwargs)
+
+    def __delattr__(self, attr: str) -> None:
+        """Fix for https://github.com/huggingface/huggingface_hub/issues/1603"""
+        try:
+            super().__delattr__(attr)
+        except AttributeError:
+            # Deleting a missing `_lock` is tolerated; any other missing attribute still raises
+            if attr != "_lock":
+                raise
+
+
+def is_sentence_transformer_model(
+    model_name_or_path: str,
+    token: Optional[Union[bool, str]] = None,
+    cache_folder: Optional[str] = None,
+    revision: Optional[str] = None,
+) -> bool:
+    """
+    Report whether a "modules.json" file can be located for the given model.
+
+    The lookup is delegated to load_file_path with the same token, cache folder and
+    revision; the result is True when that call returns a non-empty path.
+    """
+    return bool(load_file_path(model_name_or_path, "modules.json", token, cache_folder, revision=revision))
+
+
+def load_file_path(
+    model_name_or_path: str,
+    filename: str,
+    token: Optional[Union[bool, str]],
+    cache_folder: Optional[str],
+    revision: Optional[str] = None,
+) -> Optional[str]:
+    """
+    Locate a single file of a model, either on disk or via hf_hub_download.
+
+    A file found at `model_name_or_path/filename` wins. Otherwise the file is requested
+    through hf_hub_download; if that call fails for any reason, None is returned.
+    """
+    # Local file takes precedence
+    local_candidate = os.path.join(model_name_or_path, filename)
+    if os.path.exists(local_candidate):
+        return local_candidate
+
+    # Remote file; best effort, every failure is reported as "not found"
+    try:
+        downloaded_path = hf_hub_download(
+            model_name_or_path,
+            filename=filename,
+            revision=revision,
+            library_name="sentence-transformers",
+            token=token,
+            cache_dir=cache_folder,
+        )
+    except Exception:
+        return None
+    return downloaded_path
+
+
+def load_dir_path(
+    model_name_or_path: str,
+    directory: str,
+    token: Optional[Union[bool, str]],
+    cache_folder: Optional[str],
+    revision: Optional[str] = None,
+) -> Optional[str]:
+    """
+    Locate a sub-directory of a model, either on disk or via snapshot_download.
+
+    A directory found at `model_name_or_path/directory` wins. Otherwise the files matching
+    `directory/**` are requested through snapshot_download; if that fails, the call is
+    repeated with `local_files_only=True`, and an error of that second call propagates.
+    """
+    # Local directory takes precedence
+    local_candidate = os.path.join(model_name_or_path, directory)
+    if os.path.exists(local_candidate):
+        return local_candidate
+
+    download_kwargs = {
+        "repo_id": model_name_or_path,
+        "revision": revision,
+        "allow_patterns": f"{directory}/**",
+        "library_name": "sentence-transformers",
+        "token": token,
+        "cache_dir": cache_folder,
+        "tqdm_class": disabled_tqdm,
+    }
+    try:
+        # Try to download from the remote
+        repo_path = snapshot_download(**download_kwargs)
+    except Exception:
+        # Otherwise, try local (i.e. cache) only
+        repo_path = snapshot_download(**download_kwargs, local_files_only=True)
+    return os.path.join(repo_path, directory)
+
+
+def save_to_hub_args_decorator(func):
+    """
+    Decorator that adapts legacy call styles before invoking the wrapped method.
+
+    * A truthy `repo_name` keyword is moved to `repo_id` (with a deprecation warning) unless
+      `repo_id` was given as a keyword; `repo_name` itself is never passed on.
+    * With two or more positional arguments, a `None` is inserted as the third one, to
+      adjust for the "token" argument.
+    """
+
+    @functools.wraps(func)
+    def wrapper(self, *args, **kwargs):
+        # If repo_id not already set, use repo_name
+        legacy_repo_name = kwargs.pop("repo_name", None)
+        if legacy_repo_name and "repo_id" not in kwargs:
+            logger.warning(
+                "Providing a `repo_name` keyword argument to `save_to_hub` is deprecated, please use `repo_id` instead."
+            )
+            kwargs["repo_id"] = legacy_repo_name
+
+        # If positional args are used, adjust for the new "token" keyword argument
+        if len(args) >= 2:
+            head, tail = args[:2], args[2:]
+            args = head + (None,) + tail
+
+        return func(self, *args, **kwargs)
+
+    return wrapper
+
+
+def get_device_name() -> Literal["mps", "cuda", "npu", "hpu", "cpu"]:
+    """
+    Returns the name of the device where this module is running on.
+
+    The backends are probed in a fixed order (CUDA, MPS, NPU, HPU) and the first available
+    one wins; "cpu" is the fallback. It's a simple implementation that doesn't cover cases
+    when more powerful GPUs are available and not a primary device ('cuda:0') or MPS device
+    is available, but not configured properly:
+    https://pytorch.org/docs/master/notes/mps.html
+
+    :return: Device name, like 'cuda' or 'cpu'
+    """
+    if torch.cuda.is_available():
+        return "cuda"
+    if torch.backends.mps.is_available():
+        return "mps"
+    if is_torch_npu_available():
+        return "npu"
+
+    # The Habana package is only imported when it can actually be found
+    if importlib.util.find_spec("habana_frameworks") is None:
+        return "cpu"
+
+    import habana_frameworks.torch.hpu as hthpu
+
+    return "hpu" if hthpu.is_available() else "cpu"
--- a/setup.cfg
+++ b/setup.cfg
+[metadata]
+description_file = README.md
--- a/setup.py
+++ b/setup.py
+from setuptools import setup, find_packages
+
+# The README doubles as the long description passed to setup() below
+with open("README.md", mode="r", encoding="utf-8") as readme_file:
+    readme = readme_file.read()
+
+
+setup(
+    name="sentence-transformers",
+    version="2.7.0.dev0",
+    author="Nils Reimers",
+    author_email="info@nils-reimers.de",
+    description="Multilingual text embeddings",
+    long_description=readme,
+    long_description_content_type="text/markdown",
+    license="Apache License 2.0",
+    url="https://www.SBERT.net",
+    download_url="https://github.com/UKPLab/sentence-transformers/",
+    packages=find_packages(),
+    python_requires=">=3.8.0",
+    # Runtime dependencies
+    install_requires=[
+        "transformers>=4.32.0,<5.0.0",
+        "tqdm",
+        "torch>=1.11.0",
+        "numpy",
+        "scikit-learn",
+        "scipy",
+        "huggingface-hub>=0.15.1",
+        "Pillow",
+    ],
+    # Optional extra "dev": tooling for linting and running the tests
+    extras_require={
+        "dev": [
+            "pre-commit",
+            "pytest",
+            "ruff>=0.3.0",
+        ],
+    },
+    classifiers=[
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3.8",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+    keywords="Transformer Networks BERT XLNet sentence embedding PyTorch NLP deep learning",
+)
--- a/tests/conftest.py
+++ b/tests/conftest.py
+import os
+import platform
+import tempfile
+import pytest
+
+from sentence_transformers import SentenceTransformer, CrossEncoder
+from sentence_transformers.models import Transformer, Pooling
+
+
+@pytest.fixture()
+def stsb_bert_tiny_model() -> SentenceTransformer:
+    """Load the "sentence-transformers-testing/stsb-bert-tiny-safetensors" model; built anew for each test using it."""
+    return SentenceTransformer("sentence-transformers-testing/stsb-bert-tiny-safetensors")
+
+
+@pytest.fixture(scope="session")
+def stsb_bert_tiny_model_reused() -> SentenceTransformer:
+    """Load the "sentence-transformers-testing/stsb-bert-tiny-safetensors" model once; session scope shares it across tests."""
+    return SentenceTransformer("sentence-transformers-testing/stsb-bert-tiny-safetensors")
+
+
+@pytest.fixture()
+def paraphrase_distilroberta_base_v1_model() -> SentenceTransformer:
+    """Load the "paraphrase-distilroberta-base-v1" SentenceTransformer model."""
+    return SentenceTransformer("paraphrase-distilroberta-base-v1")
+
+
+@pytest.fixture()
+def distilroberta_base_ce_model() -> CrossEncoder:
+    """Build a CrossEncoder on "distilroberta-base" with a single output label."""
+    return CrossEncoder("distilroberta-base", num_labels=1)
+
+
+@pytest.fixture()
+def clip_vit_b_32_model() -> SentenceTransformer:
+    """Load the "clip-ViT-B-32" SentenceTransformer model."""
+    return SentenceTransformer("clip-ViT-B-32")
+
+
+@pytest.fixture()
+def distilbert_base_uncased_model() -> SentenceTransformer:
+    """Assemble a SentenceTransformer from a "distilbert-base-uncased" Transformer module followed by a Pooling module."""
+    word_embedding_model = Transformer("distilbert-base-uncased")
+    # The pooling layer is sized from the transformer's word embedding dimension
+    pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension())
+    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+    return model
+
+
+@pytest.fixture()
+def cache_dir():
+    """
+    In the CI environment, we use a temporary directory as `cache_dir`
+    to avoid keeping the downloaded models on disk after the test.
+
+    This is only required for Ubuntu, as we otherwise have disk space issues there.
+
+    Yields the path of a temporary directory when the `CI` environment variable is set
+    and the platform is Linux, and `None` otherwise.
+    """
+    if os.environ.get("CI", None) and platform.system() == "Linux":
+        # The directory is removed when the `with` block exits, i.e. after the test has used it
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            yield tmp_dir
+    else:
+        # NOTE(review): presumably `None` lets the consumer fall back to its default cache — confirm
+        yield None