"""Performs pooling (max or mean) on the token embeddings.
"""
Performs pooling (max or mean) on the token embeddings.
Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows
Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows
to use the CLS token if it is returned by the underlying word embedding model. You can concatenate multiple poolings
to use the CLS token if it is returned by the underlying word embedding model. You can concatenate multiple poolings
together.
together.
Args:
    word_embedding_dimension: Dimensions for the word embeddings
    pooling_mode: Either "cls", "lasttoken", "max", "mean",
        "mean_sqrt_len_tokens", or "weightedmean". If set,
        overwrites the other pooling_mode_* settings
    pooling_mode_cls_token: Use the first token (CLS token) as text
        representations
    pooling_mode_max_tokens: Use max in each dimension over all
        tokens.
    pooling_mode_mean_tokens: Perform mean-pooling
    pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but
        divide by sqrt(input_length).
    pooling_mode_weightedmean_tokens: Perform (position) weighted mean pooling. See `SGPT: GPT Sentence
        Embeddings for Semantic Search <https://arxiv.org/abs/2202.08904>`_.
    pooling_mode_lasttoken: Perform last token pooling. See `SGPT: GPT Sentence Embeddings for Semantic
        Search <https://arxiv.org/abs/2202.08904>`_ and `Text and Code Embeddings by Contrastive
        Pre-Training <https://arxiv.org/abs/2201.10005>`_.
"""
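The pooling modes documented above can be sketched in plain Python. This is a toy illustration, not the library's actual PyTorch implementation; the `pool` helper, its signature, and the list-based "embeddings" are made up for this example:

```python
def pool(token_embeddings, attention_mask, mode="mean"):
    """Collapse per-token embeddings into one fixed-size sentence embedding."""
    # Keep only real tokens; padding positions (attention_mask == 0) are excluded.
    rows = [emb for emb, m in zip(token_embeddings, attention_mask) if m]
    dims = range(len(rows[0]))
    if mode == "mean":
        # Average each dimension over the non-padding tokens.
        return [sum(r[d] for r in rows) / len(rows) for d in dims]
    if mode == "max":
        # Per-dimension maximum over the non-padding tokens.
        return [max(r[d] for r in rows) for d in dims]
    if mode == "cls":
        # First token is the CLS token in BERT-style models.
        return list(token_embeddings[0])
    raise ValueError(f"unknown pooling mode: {mode}")

emb = [[1.0, 4.0], [3.0, 2.0], [9.0, 9.0]]  # 3 tokens, embedding dim = 2
mask = [1, 1, 0]                            # last token is padding
print(pool(emb, mask, "mean"))  # [2.0, 3.0]
print(pool(emb, mask, "max"))   # [3.0, 4.0]
```

Whatever the mode, the output dimension stays fixed regardless of how many tokens the sentence has, which is what makes the result usable as a sentence embedding.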
...

Args:
    word_weights (Dict[str, float]): Mapping of tokens to a float weight value. Word embeddings are multiplied
        by this float value. Tokens in word_weights need not match the vocab (it can contain more or fewer
        entries).
    unknown_word_weight (float, optional): Weight for words in vocab that do not appear in the word_weights
        lookup. These can be, for example, rare words in the vocab where no weight exists. Defaults to 1.
"""
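The weighting described above amounts to scaling each token's embedding by a per-token factor, falling back to `unknown_word_weight` for tokens missing from the lookup. A toy sketch, with an illustrative function name and list-based embeddings (not the library's actual code):

```python
def weight_embeddings(tokens, token_embeddings, word_weights, unknown_word_weight=1.0):
    """Scale each token embedding by its per-token weight (e.g. an IDF value)."""
    weighted = []
    for token, emb in zip(tokens, token_embeddings):
        # Tokens absent from the lookup fall back to unknown_word_weight.
        w = word_weights.get(token, unknown_word_weight)
        weighted.append([w * x for x in emb])
    return weighted

weights = {"the": 0.1}  # down-weight a very common word
out = weight_embeddings(["the", "cat"], [[1.0, 2.0], [3.0, 4.0]], weights)
print(out)  # [[0.1, 0.2], [3.0, 4.0]]
```

Because the lookup only changes per-token scaling, it composes naturally with any of the pooling modes applied afterwards.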
"""Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx)
Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx)
Default values expects a tab separated file with the first & second column the sentence pair and third column the score (0...1). Default config normalizes scores from 0...5 to 0...1
Default values expects a tab separated file with the first & second column the sentence pair and third column the score (0...1). Default config normalizes scores from 0...5 to 0...1
"""
"""
...

@@ -34,9 +34,7 @@ class STSDataReader:
        self.max_score = max_score

    def get_examples(self, filename, max_examples=0):
"""
"""filename specified which data split to use (train.csv, dev.csv, test.csv)."""
filename specified which data split to use (train.csv, dev.csv, test.csv).
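A minimal sketch of what such a reader does with each row, using only the standard library. The delimiter, column indices, and the 0...5 to 0...1 normalization follow the defaults described in the docstring above; the function name and tuple output are illustrative, not the class's real API:

```python
import csv
from io import StringIO

def read_sts_examples(fileobj, s1_col_idx=0, s2_col_idx=1, score_col_idx=2,
                      delimiter="\t", min_score=0.0, max_score=5.0):
    """Yield (sentence1, sentence2, normalized_score) tuples from a delimited file."""
    for row in csv.reader(fileobj, delimiter=delimiter):
        score = float(row[score_col_idx])
        # Normalize the gold score from [min_score, max_score] to [0, 1].
        normalized = (score - min_score) / (max_score - min_score)
        yield row[s1_col_idx], row[s2_col_idx], normalized

data = "A man is eating.\tA person eats.\t4.0\n"
examples = list(read_sts_examples(StringIO(data)))
print(examples[0])  # ('A man is eating.', 'A person eats.', 0.8)
```

Normalizing to the 0...1 range matters because downstream similarity losses typically compare the label directly against a cosine similarity, which lives on a bounded scale.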