Commit b0f4f53a authored by Rayyyyy

Update version according to github

parent 392df446
from typing import Dict
import torch.nn.functional as F
from torch import Tensor, nn
class Normalize(nn.Module):
"""
This layer normalizes embeddings to unit length
"""
"""This layer normalizes embeddings to unit length"""
def __init__(self):
super(Normalize, self).__init__()
......
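The elided body amounts to a single F.normalize call. As a minimal sketch, assuming the library's convention of passing a feature dict with a "sentence_embedding" key between modules:

    # Sketch of Normalize.forward: L2-normalize the sentence embedding so that
    # cosine similarity downstream reduces to a dot product.
    def forward(self, features):
        features["sentence_embedding"] = F.normalize(features["sentence_embedding"], p=2, dim=1)
        return features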
import json
import os
from typing import Dict
import torch
from torch import Tensor, nn
class Pooling(nn.Module):
"""Performs pooling (max or mean) on the token embeddings.
"""
Performs pooling (max or mean) on the token embeddings.
Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows
to use the CLS token if it is returned by the underlying word embedding model. You can concatenate multiple poolings
together.
:param word_embedding_dimension: Dimensions for the word embeddings
:param pooling_mode: Either "cls", "lasttoken", "max", "mean", "mean_sqrt_len_tokens", or "weightedmean". If set, overwrites the other pooling_mode_* settings
:param pooling_mode_cls_token: Use the first token (CLS token) as text representations
:param pooling_mode_max_tokens: Use max in each dimension over all tokens.
:param pooling_mode_mean_tokens: Perform mean-pooling
:param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but divide by sqrt(input_length).
:param pooling_mode_weightedmean_tokens: Perform (position) weighted mean pooling. See `SGPT: GPT Sentence Embeddings for Semantic Search <https://arxiv.org/abs/2202.08904>`_.
:param pooling_mode_lasttoken: Perform last token pooling. See `SGPT: GPT Sentence Embeddings for Semantic Search <https://arxiv.org/abs/2202.08904>`_ and `Text and Code Embeddings by Contrastive Pre-Training <https://arxiv.org/abs/2201.10005>`_.
Args:
word_embedding_dimension: Dimensions for the word embeddings
pooling_mode: Either "cls", "lasttoken", "max", "mean",
"mean_sqrt_len_tokens", or "weightedmean". If set,
overwrites the other pooling_mode_* settings
pooling_mode_cls_token: Use the first token (CLS token) as text
representations
pooling_mode_max_tokens: Use max in each dimension over all
tokens.
pooling_mode_mean_tokens: Perform mean-pooling
pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but
divide by sqrt(input_length).
pooling_mode_weightedmean_tokens: Perform (position) weighted
mean pooling. See `SGPT: GPT Sentence Embeddings for
Semantic Search <https://arxiv.org/abs/2202.08904>`_.
pooling_mode_lasttoken: Perform last token pooling. See `SGPT:
GPT Sentence Embeddings for Semantic Search
<https://arxiv.org/abs/2202.08904>`_ and `Text and Code
Embeddings by Contrastive Pre-Training
<https://arxiv.org/abs/2201.10005>`_.
"""
POOLING_MODES = (
@@ -98,9 +111,7 @@ class Pooling(nn.Module):
return "Pooling({})".format(self.get_config_dict())
def get_pooling_mode_str(self) -> str:
"""
Returns the pooling mode as string
"""
"""Returns the pooling mode as string"""
modes = []
if self.pooling_mode_cls_token:
modes.append("cls")
@@ -209,7 +220,7 @@ class Pooling(nn.Module):
output_vectors.append(embedding)
output_vector = torch.cat(output_vectors, 1)
features["sentence_embedding"] = output_vector
return features
def get_sentence_embedding_dimension(self):
......
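For reference, the mean-pooling branch elided above reduces to masked averaging over the token embeddings. A simplified sketch (not the full multi-mode forward):

    import torch

    # Simplified sketch of masked mean pooling over token embeddings.
    def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = (token_embeddings * mask).sum(dim=1)  # sum over the token axis
        counts = mask.sum(dim=1).clamp(min=1e-9)       # avoid division by zero
        return summed / counts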
import json
import os
from typing import Any, Dict, List, Optional, Tuple, Union
from torch import nn
from transformers import AutoConfig, AutoModel, AutoTokenizer, MT5Config, T5Config
class Transformer(nn.Module):
"""Huggingface AutoModel to generate token embeddings.
Loads the correct class, e.g. BERT / RoBERTa etc.
Args:
    model_name_or_path: Huggingface models name
        (https://huggingface.co/models)
    max_seq_length: Truncate any inputs longer than max_seq_length
    model_args: Keyword arguments passed to the Huggingface
        Transformers model
    tokenizer_args: Keyword arguments passed to the Huggingface
        Transformers tokenizer
    config_args: Keyword arguments passed to the Huggingface
        Transformers config
    cache_dir: Cache dir for Huggingface Transformers to store/load
        models
    do_lower_case: If true, lowercases the input (independent of
        whether the model is cased or not)
    tokenizer_name_or_path: Name or path of the tokenizer. When
        None, then model_name_or_path is used
"""
def __init__(
self,
model_name_or_path: str,
max_seq_length: Optional[int] = None,
model_args: Optional[Dict[str, Any]] = None,
tokenizer_args: Optional[Dict[str, Any]] = None,
config_args: Optional[Dict[str, Any]] = None,
cache_dir: Optional[str] = None,
do_lower_case: bool = False,
tokenizer_name_or_path: Optional[str] = None,
):
super(Transformer, self).__init__()
self.config_keys = ["max_seq_length", "do_lower_case"]
self.do_lower_case = do_lower_case
if model_args is None:
    model_args = {}
if tokenizer_args is None:
    tokenizer_args = {}
if config_args is None:
    config_args = {}
config = AutoConfig.from_pretrained(model_name_or_path, **config_args, cache_dir=cache_dir)
self._load_model(model_name_or_path, config, cache_dir, **model_args)
if max_seq_length is not None and "model_max_length" not in tokenizer_args:
tokenizer_args["model_max_length"] = max_seq_length
self.tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path,
cache_dir=cache_dir,
@@ -114,9 +133,7 @@ class Transformer(nn.Module):
return self.auto_model.config.hidden_size
def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]], padding: Union[str, bool] = True):
"""
Tokenizes a text and maps tokens to token-ids
"""
"""Tokenizes a text and maps tokens to token-ids"""
output = {}
if isinstance(texts[0], str):
to_tokenize = [texts]
@@ -182,6 +199,10 @@ class Transformer(nn.Module):
with open(sbert_config_path) as fIn:
config = json.load(fIn)
# Don't allow configs to set trust_remote_code
if "model_args" in config:
if "model_args" in config and "trust_remote_code" in config["model_args"]:
config["model_args"].pop("trust_remote_code")
if "tokenizer_args" in config and "trust_remote_code" in config["tokenizer_args"]:
config["tokenizer_args"].pop("trust_remote_code")
if "config_args" in config and "trust_remote_code" in config["config_args"]:
config["config_args"].pop("trust_remote_code")
return Transformer(model_name_or_path=input_path, **config)
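A hypothetical usage sketch of the refactored keyword-dict arguments (model name and kwargs are illustrative; each dict is forwarded to the matching from_pretrained call):

    from sentence_transformers.models import Pooling, Transformer

    word_embedding = Transformer(
        "bert-base-uncased",
        max_seq_length=256,
        model_args={"torch_dtype": "auto"},          # -> AutoModel.from_pretrained
        tokenizer_args={"use_fast": True},           # -> AutoTokenizer.from_pretrained
        config_args={"output_hidden_states": True},  # -> AutoConfig.from_pretrained
    )
    pooling = Pooling(word_embedding.get_word_embedding_dimension(), pooling_mode="mean")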
import json
import os
from typing import Dict
import torch
from torch import Tensor, nn
class WeightedLayerPooling(nn.Module):
"""
Token embeddings are weighted mean of their different hidden layer representations
"""
"""Token embeddings are weighted mean of their different hidden layer representations"""
def __init__(
self, word_embedding_dimension, num_hidden_layers: int = 12, layer_start: int = 4, layer_weights=None
......
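The elided forward computes a learned weighted average over the hidden layers. A minimal sketch of the core computation, assuming the stacked layout [num_layers, batch, seq_len, dim]:

    import torch
    from torch import nn

    # Sketch: average the layers from layer_start onward, weighted by the
    # learnable layer_weights parameter of shape [num_layers - layer_start].
    def weighted_layer_average(all_layer_embeddings: torch.Tensor, layer_weights: nn.Parameter, layer_start: int = 4) -> torch.Tensor:
        layers = all_layer_embeddings[layer_start:]
        weights = layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
        return (weights * layers).sum(dim=0) / layer_weights.sum()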
import gzip
import json
import logging
import os
from typing import List
import numpy as np
import torch
from torch import nn
from tqdm import tqdm
from sentence_transformers.util import fullname, http_get, import_from_string
from .tokenizer import WhitespaceTokenizer, WordTokenizer
logger = logging.getLogger(__name__)
......
import json
import logging
import os
from typing import Dict, List
import torch
from torch import Tensor, nn
logger = logging.getLogger(__name__)
@@ -15,13 +14,14 @@ class WordWeights(nn.Module):
def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1):
"""
Initializes the WordWeights class.

Args:
    vocab (List[str]): Vocabulary of the tokenizer.
    word_weights (Dict[str, float]): Mapping of tokens to a float weight value. Word embeddings are multiplied
        by this float value. Tokens in word_weights need not match the vocab exactly (it may contain more or fewer entries).
    unknown_word_weight (float, optional): Weight for words in vocab that do not appear in the word_weights lookup.
        These can be, for example, rare words in the vocab for which no weight exists. Defaults to 1.
"""
super(WordWeights, self).__init__()
self.config_keys = ["vocab", "word_weights", "unknown_word_weight"]
......
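A small usage sketch (toy vocabulary and IDF-style weights, purely illustrative):

    from sentence_transformers.models import WordWeights

    # "[UNK]" has no entry in word_weights, so it falls back to unknown_word_weight.
    vocab = ["the", "movie", "great", "[UNK]"]
    word_weights = {"the": 0.1, "movie": 1.2, "great": 1.8}
    layer = WordWeights(vocab=vocab, word_weights=word_weights, unknown_word_weight=1.0)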
from .Asym import Asym
from .BoW import BoW
from .CLIPModel import CLIPModel
from .CNN import CNN
from .Dense import Dense
from .Dropout import Dropout
@@ -8,10 +8,10 @@ from .LayerNorm import LayerNorm
from .LSTM import LSTM
from .Normalize import Normalize
from .Pooling import Pooling
from .Transformer import Transformer
from .WeightedLayerPooling import WeightedLayerPooling
from .WordEmbeddings import WordEmbeddings
from .WordWeights import WordWeights
__all__ = [
"Transformer",
......
import collections
import json
import logging
import os
import string
from typing import Iterable, List
from transformers.utils.import_utils import NLTK_IMPORT_ERROR, is_nltk_available
from .WordTokenizer import ENGLISH_STOP_WORDS, WordTokenizer
logger = logging.getLogger(__name__)
......
import collections
import json
import os
import string
from typing import Iterable, List
from .WordTokenizer import ENGLISH_STOP_WORDS, WordTokenizer
class WhitespaceTokenizer(WordTokenizer):
......
from abc import ABC, abstractmethod
from typing import Iterable, List
ENGLISH_STOP_WORDS = [
"!",
......
from .PhraseTokenizer import PhraseTokenizer
from .WhitespaceTokenizer import WhitespaceTokenizer
from .WordTokenizer import ENGLISH_STOP_WORDS, WordTokenizer
__all__ = ["WordTokenizer", "WhitespaceTokenizer", "PhraseTokenizer", "ENGLISH_STOP_WORDS"]
import logging
import time
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
import numpy as np
from torch import Tensor
logger = logging.getLogger(__name__)
@@ -37,31 +36,53 @@ def semantic_search_faiss(
Only if these conditions are true will we search for `top_k * rescore_multiplier` samples and then rescore to
keep only `top_k`.
Args:
query_embeddings: Embeddings of the query sentences. Ideally not
quantized to allow for rescoring.
corpus_embeddings: Embeddings of the corpus sentences. Either
`corpus_embeddings` or `corpus_index` should be used, not
both. The embeddings can be quantized to "int8" or "binary"
for more efficient search.
corpus_index: FAISS index for the corpus sentences. Either
`corpus_embeddings` or `corpus_index` should be used, not
both.
corpus_precision: Precision of the corpus embeddings. The
options are "float32", "int8", or "binary". Default is
"float32".
top_k: Number of top results to retrieve. Default is 10.
ranges: Ranges for quantization of embeddings. This is only used
for int8 quantization, where the ranges refer to the
minimum and maximum values for each dimension. So, it's a 2D
array with shape (2, embedding_dim). Default is None, which
means that the ranges will be calculated from the
calibration embeddings.
calibration_embeddings: Embeddings used for calibration during
quantization. This is only used for int8 quantization, where
the calibration embeddings can be used to compute ranges,
i.e. the minimum and maximum values for each dimension.
Default is None, which means that the ranges will be
calculated from the query embeddings. This is not
recommended.
rescore: Whether to perform rescoring. Note that rescoring still
will only be used if the query embeddings are not quantized
and the corpus is quantized, i.e. the corpus precision is
not "float32". Default is True.
rescore_multiplier: Oversampling factor for rescoring. The code
will now search `top_k * rescore_multiplier` samples and
then rescore to only keep `top_k`. Default is 2.
exact: Whether to use exact search or approximate search.
Default is True.
output_index: Whether to output the FAISS index used for the
search. Default is False.
Returns:
A tuple containing a list of search results and the time taken
for the search. If `output_index` is True, the tuple will also
contain the FAISS index used for the search.
Raises:
ValueError: If both `corpus_embeddings` and `corpus_index` are
provided or if neither is provided.
The list of search results is in the format: [[{"corpus_id": int, "score": float}, ...], ...]
The time taken for the search is a float value.
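A usage sketch tying these parameters together (model name and data are illustrative; the import path follows this module):

    from sentence_transformers import SentenceTransformer
    from sentence_transformers.quantization import quantize_embeddings, semantic_search_faiss

    model = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model choice
    corpus = ["A man is eating food.", "A monkey is playing drums."]
    full_corpus_embeddings = model.encode(corpus)
    # int8-quantized corpus with float32 queries, so the rescoring path applies.
    corpus_embeddings = quantize_embeddings(
        full_corpus_embeddings, precision="int8", calibration_embeddings=full_corpus_embeddings
    )
    query_embeddings = model.encode(["What is the monkey doing?"])
    results, search_time = semantic_search_faiss(
        query_embeddings,
        corpus_embeddings=corpus_embeddings,
        corpus_precision="int8",
        calibration_embeddings=full_corpus_embeddings,
        top_k=2,
    )
    print(results)  # [[{"corpus_id": ..., "score": ...}, ...]]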
@@ -182,37 +203,59 @@ def semantic_search_usearch(
Only if these conditions are true will we search for `top_k * rescore_multiplier` samples and then rescore to
keep only `top_k`.
Args:
query_embeddings: Embeddings of the query sentences. Ideally not
quantized to allow for rescoring.
corpus_embeddings: Embeddings of the corpus sentences. Either
`corpus_embeddings` or `corpus_index` should be used, not
both. The embeddings can be quantized to "int8" or "binary"
for more efficient search.
corpus_index: usearch index for the corpus sentences. Either
`corpus_embeddings` or `corpus_index` should be used, not
both.
corpus_precision: Precision of the corpus embeddings. The
options are "float32", "int8", or "binary". Default is
"float32".
top_k: Number of top results to retrieve. Default is 10.
ranges: Ranges for quantization of embeddings. This is only used
for int8 quantization, where the ranges refer to the
minimum and maximum values for each dimension. So, it's a 2D
array with shape (2, embedding_dim). Default is None, which
means that the ranges will be calculated from the
calibration embeddings.
calibration_embeddings: Embeddings used for calibration during
quantization. This is only used for int8 quantization, where
the calibration embeddings can be used to compute ranges,
i.e. the minimum and maximum values for each dimension.
Default is None, which means that the ranges will be
calculated from the query embeddings. This is not
recommended.
rescore: Whether to perform rescoring. Note that rescoring still
will only be used if the query embeddings are not quantized
and the corpus is quantized, i.e. the corpus precision is
not "float32". Default is True.
rescore_multiplier: Oversampling factor for rescoring. The code
will now search `top_k * rescore_multiplier` samples and
then rescore to only keep `top_k`. Default is 2.
exact: Whether to use exact search or approximate search.
Default is True.
output_index: Whether to output the usearch index used for the
search. Default is False.
Returns:
A tuple containing a list of search results and the time taken
for the search. If `output_index` is True, the tuple will also
contain the usearch index used for the search.
Raises:
ValueError: If both `corpus_embeddings` and `corpus_index` are
provided or if neither is provided.
The list of search results is in the format: [[{"corpus_id": int, "score": float}, ...], ...]
The time taken for the search is a float value.
"""
from usearch.compiled import ScalarKind
from usearch.index import Index
if corpus_embeddings is not None and corpus_index is not None:
raise ValueError("Only corpus_embeddings or corpus_index should be used, not both.")
@@ -239,7 +282,7 @@ def semantic_search_usearch(
corpus_index = Index(
ndim=corpus_embeddings.shape[1],
metric="hamming",
dtype="i8",
dtype="b1",
)
corpus_index.add(np.arange(len(corpus_embeddings)), corpus_embeddings)
@@ -327,18 +370,27 @@ def quantize_embeddings(
Quantizes embeddings to a lower precision. This can be used to reduce the memory footprint and increase the
speed of similarity search. The supported precisions are "float32", "int8", "uint8", "binary", and "ubinary".
Args:
embeddings: Unquantized (e.g. float) embeddings to quantize
    to a given precision
precision: The precision to convert to. Options are "float32",
"int8", "uint8", "binary", "ubinary".
ranges (Optional[np.ndarray]): Ranges for quantization of
embeddings. This is only used for int8 quantization, where
the ranges refer to the minimum and maximum values for each
dimension. So, it's a 2D array with shape (2,
embedding_dim). Default is None, which means that the ranges
will be calculated from the calibration embeddings.
calibration_embeddings (Optional[np.ndarray]): Embeddings used
for calibration during quantization. This is only used for
int8 quantization, where the calibration embeddings can be
used to compute ranges, i.e. the minimum and maximum values
for each dimension. Default is None, which means that the
ranges will be calculated from the query embeddings. This is
not recommended.
Returns:
Quantized embeddings with the specified precision
"""
if isinstance(embeddings, Tensor):
embeddings = embeddings.cpu().numpy()
......
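To make the int8 path concrete, a sketch of the range-based scaling described above (dimensions and data are illustrative; the real function also handles uint8, binary, and ubinary):

    import numpy as np

    # ranges has shape (2, embedding_dim): row 0 = per-dimension minimum,
    # row 1 = per-dimension maximum, computed from calibration embeddings.
    calibration = np.random.randn(1000, 384).astype(np.float32)
    ranges = np.vstack([calibration.min(axis=0), calibration.max(axis=0)])

    embeddings = np.random.randn(8, 384).astype(np.float32)
    starts, steps = ranges[0], (ranges[1] - ranges[0]) / 255
    int8_embeddings = ((embeddings - starts) / steps - 128).astype(np.int8)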
from typing import List, Union
class InputExample:
"""
Structure for one input example with texts, the label and a unique id
"""
"""Structure for one input example with texts, the label and a unique id"""
def __init__(self, guid: str = "", texts: List[str] = None, label: Union[int, float] = 0):
"""
Creates one InputExample with the given texts, guid and label
Args:
guid: id for the example
texts: the texts for the example.
label: the label for the example
"""
self.guid = guid
self.texts = texts
......
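For example (toy values):

    # A scored sentence pair, e.g. for a regression-style similarity loss.
    example = InputExample(guid="pair-1", texts=["A plane is taking off.", "An air plane is taking off."], label=0.95)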
import os
from . import InputExample
class LabelSentenceReader:
"""Reads in a file that has at least two columns: a label and a sentence.
This reader can for example be used with the BatchHardTripletLoss.
Maps labels automatically to integers
"""
def __init__(self, folder, label_col_idx=0, sentence_col_idx=1, separator="\t"):
self.folder = folder
......
import gzip
import os
from . import InputExample
class NLIDataReader(object):
"""
Reads in the Stanford NLI dataset and the MultiGenre NLI dataset
"""
"""Reads in the Stanford NLI dataset and the MultiGenre NLI dataset"""
def __init__(self, dataset_folder):
self.dataset_folder = dataset_folder
......
import gzip
from . import InputExample
class PairedFilesReader(object):
"""
Reads in the a Pair Dataset, split in two files
"""
"""Reads in the a Pair Dataset, split in two files"""
def __init__(self, filepaths):
self.filepaths = filepaths
def get_examples(self, max_examples=0):
""" """
fIns = []
for filepath in self.filepaths:
fIn = (
......
import csv
import gzip
import os
from . import InputExample
class STSDataReader:
"""
Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx)
"""Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx)
Default values expects a tab separated file with the first & second column the sentence pair and third column the score (0...1). Default config normalizes scores from 0...5 to 0...1
"""
@@ -34,9 +34,7 @@ class STSDataReader:
self.max_score = max_score
def get_examples(self, filename, max_examples=0):
"""
filename specified which data split to use (train.csv, dev.csv, test.csv).
"""
"""filename specified which data split to use (train.csv, dev.csv, test.csv)."""
filepath = os.path.join(self.dataset_folder, filename)
with gzip.open(filepath, "rt", encoding="utf8") if filename.endswith(".gz") else open(
filepath, encoding="utf-8"
@@ -59,8 +57,7 @@ class STSDataReader:
class STSBenchmarkDataReader(STSDataReader):
"""
Reader especially for the STS benchmark dataset. There, the sentences are in column 5 and 6, the score is in column 4.
"""Reader especially for the STS benchmark dataset. There, the sentences are in column 5 and 6, the score is in column 4.
Scores are normalized from 0...5 to 0...1
"""
......
import csv
import os
from . import InputExample
class TripletReader(object):
"""
Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1),
"""Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1),
one positive example (s2) and one negative example (s3)
"""
@@ -28,7 +28,6 @@ class TripletReader(object):
self.quoting = quoting
def get_examples(self, filename, max_examples=0):
""" """
data = csv.reader(
open(os.path.join(self.dataset_folder, filename), encoding="utf-8"),
delimiter=self.delimiter,
......
from .InputExample import InputExample
from .LabelSentenceReader import LabelSentenceReader
from .NLIDataReader import NLIDataReader
from .STSDataReader import STSBenchmarkDataReader, STSDataReader
from .TripletReader import TripletReader
__all__ = [
......
import logging
from collections import defaultdict
from itertools import accumulate, cycle
from typing import List
import torch
from torch.utils.data import BatchSampler, ConcatDataset, SubsetRandomSampler
from sentence_transformers.util import is_datasets_available
if is_datasets_available():
from datasets import Dataset
logger = logging.getLogger(__name__)
class SetEpochMixin:
"""
Required for a BatchSampler as the Trainer will call set_epoch on the BatchSampler at the beginning of each epoch.
The BatchSampler can then set the generator seed accordingly.
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.epoch = 0
def set_epoch(self, epoch: int):
self.epoch = epoch
class DefaultBatchSampler(SetEpochMixin, BatchSampler):
pass
class GroupByLabelBatchSampler(SetEpochMixin, BatchSampler):
def __init__(
self,
dataset: "Dataset",
batch_size: int,
drop_last: bool,
valid_label_columns: List[str] = None,
generator: torch.Generator = None,
seed: int = 0,
):
super().__init__(dataset, batch_size, drop_last)
self.dataset = dataset
self.batch_size = batch_size
self.drop_last = drop_last
self.generator = generator
self.seed = seed
if self.batch_size % 2 == 1:
raise ValueError("The batch size for `GroupByLabelBatchSampler` must be divisible by 2.")
for column_name in valid_label_columns or []:
if column_name in dataset.column_names:
labels = dataset["label"]
break
else:
raise ValueError(f"None of the valid_label_columns {valid_label_columns} are in the dataset.")
del dataset
groups = defaultdict(list)
for sample_idx, label in enumerate(labels):
groups[label].append(sample_idx)
self.groups = {
label: sample_indices[: num_samples * 2]  # keep an even number of samples per label; drop labels with fewer than 2
for label, sample_indices in groups.items()
if (num_samples := len(sample_indices) // 2)
}
def __iter__(self):
if self.generator and self.seed:
self.generator.manual_seed(self.seed + self.epoch)
labels = list(self.groups.keys())
partial_batch = []
for label_idx in torch.randperm(len(self.groups), generator=self.generator):
label = labels[label_idx]
samples = self.groups[label]
partial_batch.extend(samples)
while len(partial_batch) >= self.batch_size:
yield partial_batch[: self.batch_size]
partial_batch = partial_batch[self.batch_size :]
if not self.drop_last and partial_batch:
yield partial_batch
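A quick usage sketch for this sampler (toy datasets.Dataset, illustrative values):

    from datasets import Dataset

    # Each batch draws an even number of samples per label, which suits
    # losses such as BatchHardTripletLoss.
    dataset = Dataset.from_dict({"text": ["a", "b", "c", "d"], "label": [0, 0, 1, 1]})
    sampler = GroupByLabelBatchSampler(dataset, batch_size=2, drop_last=False, valid_label_columns=["label"])
    for batch_indices in sampler:
        print(batch_indices)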
class NoDuplicatesBatchSampler(SetEpochMixin, BatchSampler):
def __init__(
self,
dataset: "Dataset",
batch_size: int,
drop_last: bool,
valid_label_columns: List[str] = [],
generator: torch.Generator = None,
seed: int = 0,
):
super().__init__(dataset, batch_size, drop_last)
if label_columns := set(dataset.column_names) & (set(valid_label_columns) | {"dataset_name"}):
dataset = dataset.remove_columns(label_columns)
self.dataset = dataset
self.batch_size = batch_size
self.drop_last = drop_last
self.generator = generator
self.seed = seed
def __iter__(self):
"""
Iterate over the remaining non-yielded indices. For each index, check if the sample values are already in the
batch. If not, add the sample values to the batch and keep going until the batch is full. Once the batch is full,
yield the batch indices and continue with the next batch.
"""
if self.generator and self.seed:
self.generator.manual_seed(self.seed + self.epoch)
remaining_indices = set(torch.randperm(len(self.dataset), generator=self.generator).tolist())
while remaining_indices:
batch_values = set()
batch_indices = []
for index in remaining_indices:
sample_values = set(self.dataset[index].values())
if sample_values & batch_values:
continue
batch_indices.append(index)
if len(batch_indices) == self.batch_size:
yield batch_indices
break
batch_values.update(sample_values)
else:
# NOTE: some indices might still have been ignored here
if not self.drop_last:
yield batch_indices
remaining_indices -= set(batch_indices)
def __len__(self) -> int:
if self.drop_last:
return len(self.dataset) // self.batch_size
else:
return (len(self.dataset) + self.batch_size - 1) // self.batch_size
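A usage sketch (toy data; this sampler suits in-batch negatives losses such as MultipleNegativesRankingLoss, where duplicate values within a batch would act as false negatives):

    import torch
    from datasets import Dataset

    dataset = Dataset.from_dict({
        "anchor": ["a", "a", "b", "c"],
        "positive": ["x", "y", "x", "z"],
    })
    sampler = NoDuplicatesBatchSampler(dataset, batch_size=2, drop_last=False, generator=torch.Generator(), seed=42)
    for batch_indices in sampler:
        print(batch_indices)  # no two rows in a batch share a value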
class RoundRobinBatchSampler(SetEpochMixin, BatchSampler):
def __init__(
self,
dataset: ConcatDataset,
batch_samplers: List[BatchSampler],
generator: torch.Generator,
seed: int,
):
super().__init__(dataset, batch_samplers[0].batch_size, batch_samplers[0].drop_last)
self.dataset = dataset
self.batch_samplers = batch_samplers
self.generator = generator
self.seed = seed
def __iter__(self):
self.generator.manual_seed(self.seed + self.epoch)
num_samples = [len(dataset) for dataset in self.dataset.datasets]
sample_offsets = [0] + list(accumulate(num_samples))
batch_samplers = [iter(sampler) for sampler in self.batch_samplers]
for dataset_idx in cycle(range(len(batch_samplers))):
sample_offset = sample_offsets[dataset_idx]
try:
yield [idx + sample_offset for idx in next(batch_samplers[dataset_idx])]
except StopIteration:
# current iterator is apparently exhausted
break
def __len__(self) -> int:
return min([len(sampler) for sampler in self.batch_samplers]) * len(self.batch_samplers)
class ProportionalBatchSampler(SetEpochMixin, BatchSampler):
def __init__(
self,
dataset: ConcatDataset,
batch_samplers: List[BatchSampler],
generator: torch.Generator,
seed: int,
):
super().__init__(dataset, batch_samplers[0].batch_size, batch_samplers[0].drop_last)
self.dataset = dataset
self.batch_samplers = batch_samplers
self.generator = generator
self.seed = seed
def __iter__(self):
self.generator.manual_seed(self.seed + self.epoch)
num_samples = [len(dataset) for dataset in self.dataset.datasets]
sample_offsets = [0] + list(accumulate(num_samples))
num_batches = [len(sampler) for sampler in self.batch_samplers]
dataset_indices = [idx for idx, length in enumerate(num_batches) for _ in range(length)]
dataset_idx_sampler = SubsetRandomSampler(dataset_indices, generator=self.generator)
batch_samplers = [iter(sampler) for sampler in self.batch_samplers]
for dataset_idx in dataset_idx_sampler:
sample_offset = sample_offsets[dataset_idx]
yield [idx + sample_offset for idx in next(batch_samplers[dataset_idx])]
def __len__(self) -> int:
return sum([len(sampler) for sampler in self.batch_samplers])
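A sketch contrasting the two multi-dataset samplers (toy datasets, illustrative): RoundRobinBatchSampler alternates between datasets and stops when the smallest is exhausted, while ProportionalBatchSampler draws batches in proportion to dataset size.

    import torch
    from torch.utils.data import BatchSampler, ConcatDataset, SequentialSampler

    datasets = [list(range(10)), list(range(20))]
    concat = ConcatDataset(datasets)
    batch_samplers = [BatchSampler(SequentialSampler(d), batch_size=2, drop_last=True) for d in datasets]
    sampler = RoundRobinBatchSampler(concat, batch_samplers, generator=torch.Generator(), seed=0)
    print(list(sampler))  # batch indices alternate between the two datasets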