SentenceTransformers also supports the option to train Cross-Encoders for sentence-pair scoring and sentence-pair classification tasks. For more details on what Cross-Encoders are and the difference between Cross- and Bi-Encoders, see [Cross-Encoders](../../applications/cross-encoder/README.md).
## Examples
See the following examples for how to train Cross-Encoders:
- [training_stsbenchmark.py](training_stsbenchmark.py) - Example of how to train for Semantic Textual Similarity (STS) on the STS benchmark dataset.
- [training_quora_duplicate_questions.py](training_quora_duplicate_questions.py) - Example of how to train a Cross-Encoder to predict whether two questions are duplicates. Uses the Quora Duplicate Questions dataset for training.
- [training_nli.py](training_nli.py) - Example of a multi-label classification task on Natural Language Inference (NLI) data.
## Training CrossEncoders
The `CrossEncoder` class is a wrapper around Huggingface `AutoModelForSequenceClassification`, but with some methods to make training and predicting scores a little bit easier. The saved models are 100% compatible with Huggingface and can also be loaded with their classes.
First, you need some sentence pair data. Each pair can either have a continuous score or a distinct class label, for example:
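Such pairs can be wrapped in `InputExample` objects. A minimal sketch (the sentences and labels below are made up for illustration):

```python
from sentence_transformers import InputExample

# Sentence pairs with a continuous similarity score, e.g. for STS-style training
train_samples = [
    InputExample(texts=["A man is eating food.", "A man is eating a piece of bread."], label=0.9),
    InputExample(texts=["A man is eating food.", "A woman is playing the violin."], label=0.1),
]

# Alternatively, sentence pairs with a distinct class label, e.g. for NLI-style training
nli_samples = [
    InputExample(texts=["A soccer game with multiple males playing.", "Some men are playing a sport."], label=1),
]
```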
Then, you define the base model and the number of labels. You can use any [Hugging Face pre-trained model](https://huggingface.co/transformers/pretrained_models.html) that is compatible with AutoModel.
For binary tasks and tasks with continuous scores (like STS), we set `num_labels=1`. For classification tasks, we set it to the number of labels we have.
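A minimal sketch of defining and training a `CrossEncoder` with the classic `fit()` API (the base model, hyperparameters, and example pair are placeholders):

```python
from torch.utils.data import DataLoader
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder

# Any Hugging Face model that works with AutoModelForSequenceClassification can be used here
model = CrossEncoder("distilroberta-base", num_labels=1)

# The sentence-pair data from above, wrapped in a DataLoader
train_samples = [
    InputExample(texts=["A man is eating food.", "A man is eating a piece of bread."], label=0.9),
    InputExample(texts=["A man is eating food.", "A woman is playing the violin."], label=0.1),
]
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)

# Train the cross-encoder
model.fit(train_dataloader=train_dataloader, epochs=1, warmup_steps=100)

# Predict a similarity score for a new sentence pair
score = model.predict([["A man is eating food.", "A man eats something."]])
```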
Bi-encoders (a.k.a. sentence embedding models) require substantial training data and fine-tuning on the target task to achieve competitive performance. However, in many scenarios only little training data is available.
To address this practical issue, we release an effective data-augmentation strategy known as <b>Augmented SBERT</b>, where we use a high-performing but slow cross-encoder (BERT) to label a larger set of input pairs and thereby augment the training data for the bi-encoder (SBERT).
For more details, refer to our publication - [Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks](https://arxiv.org/abs/2010.08240) which is a joint effort by Nandan Thakur, Nils Reimers and Johannes Daxenberger of UKP Lab, TU Darmstadt.
Chien Vu also wrote a nice blog article on this technique: [Advance BERT model via transferring knowledge from Cross-Encoders to Bi-Encoders](https://towardsdatascience.com/advance-nlp-model-via-transferring-knowledge-from-cross-encoders-to-bi-encoders-3e0fc564f554)
## Extend to your own datasets
**Scenario 1: Limited or small annotated datasets (few labeled sentence pairs, 1k-3k)**\
If you have specialized datasets in your company or research that are small or contain only a few labeled sentence pairs, you can apply the Augmented SBERT (in-domain) strategy: train a cross-encoder over your small gold dataset, use BM25 sampling to generate sentence-pair combinations not seen before, and label these unlabeled pairs with the cross-encoder to create a silver dataset. Finally, train a bi-encoder (i.e. SBERT) over the extended (gold + silver) dataset, as shown in [train_sts_indomain_bm25.py](train_sts_indomain_bm25.py).
**Scenario 2: No annotated datasets (only unlabeled sentence pairs)**\
If you have specialized datasets in your company or research that contain only unlabeled sentence pairs, you can apply the Augmented SBERT (domain-transfer) strategy: train a cross-encoder over an annotated source dataset (e.g. QQP) and use this cross-encoder to label your specialized unlabeled dataset, i.e. the target dataset. Finally, train a bi-encoder (i.e. SBERT) over the labeled target dataset, as shown in [train_sts_qqp_crossdomain.py](train_sts_qqp_crossdomain.py).
## Methodology
There are two major scenarios for the Augmented SBERT approach for pairwise-sentence regression or classification tasks.
## Scenario 1: Limited or small annotated datasets (few labeled sentence-pairs)
We apply the Augmented SBERT (<b>In-domain</b>) strategy; it involves three steps:
- Step 1: Train a cross-encoder (BERT) over the small (gold or annotated) dataset
- Step 2.1: Create pairs by recombination and reduce the pairs via BM25 or semantic search
- Step 2.2: Weakly label the new pairs with the cross-encoder (BERT). These weakly labeled pairs form the silver dataset (see the sketch after this list)
- Step 3: Finally, train a bi-encoder (SBERT) on the extended (gold + silver) training dataset
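A minimal sketch of step 2.2, assuming a cross-encoder already trained on the gold data in step 1 (the model path and sentence pairs are placeholders):

```python
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder

# Unlabeled sentence pairs produced in step 2.1 (e.g. via BM25 recombination)
unlabeled_pairs = [
    ["A man is playing a guitar.", "Someone is playing an instrument."],
    ["A man is playing a guitar.", "A chef is cooking pasta in a kitchen."],
]

# Step 2.2: weakly label the pairs with the cross-encoder trained in step 1
cross_encoder = CrossEncoder("path/to/cross-encoder-trained-on-gold-data")
silver_scores = cross_encoder.predict(unlabeled_pairs)

# These weakly labeled pairs form the silver dataset used together with the gold data in step 3
silver_samples = [
    InputExample(texts=pair, label=float(score))
    for pair, score in zip(unlabeled_pairs, silver_scores)
]
```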
The [examples/training/data_augmentation](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/data_augmentation/) folder contains simple training examples for each scenario explained below:
- This script initially trains a cross-encoder (BERT) model from scratch on the STS benchmark dataset.
- It then labels the Quora Question Pairs (QQP) training dataset (assuming no labels are present) using this cross-encoder.
- Finally, it trains a bi-encoder (SBERT) model on the labeled QQP dataset (Augmented SBERT Domain-Transfer strategy).
## Citation
If you use the code for augmented sbert, feel free to cite our publication [Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks](https://arxiv.org/abs/2010.08240):
```
@article{thakur-2020-AugSBERT,
  title = "Augmented SBERT: Data Augmentation Method for Improving Bi-Encoders for Pairwise Sentence Scoring Tasks",
  author = "Thakur, Nandan and Reimers, Nils and Daxenberger, Johannes and Gurevych, Iryna",
  journal = "arXiv preprint arXiv:2010.08240",
  year = "2020",
  url = "https://arxiv.org/abs/2010.08240",
}
```
Most dataset configurations will take one of four forms:
- **Case 1**: The example is a pair of sentences and a label indicating how similar they are. The label can be either an integer or a float. This case applies to datasets originally prepared for Natural Language Inference (NLI), since they contain pairs of sentences with a label indicating whether they entail each other or not.
- **Case 2**: The example is a pair of positive (similar) sentences **without** a label. For example, pairs of paraphrases, pairs of full texts and their summaries, pairs of duplicate questions, pairs of (`query`, `response`), or pairs of (`source_language`, `target_language`). Natural Language Inference datasets can also be formatted this way by pairing entailing sentences.
- **Case 3**: The example is a sentence with an integer label indicating the class to which it belongs. This data format is easily converted by loss functions into three sentences (triplets), where the first is an "anchor", the second a "positive" of the same class as the anchor, and the third a "negative" of a different class.
- **Case 4**: The example is a triplet (anchor, positive, negative) of sentences without classes or labels.
Note that Sentence Transformers models can be trained with human labeling (cases 1 and 3) or with labels automatically deduced from text formatting (cases 2 and 4).
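A minimal sketch of these four formats using `InputExample` (all sentences and labels are made up for illustration):

```python
from sentence_transformers import InputExample

# Case 1: a sentence pair with a similarity label (integer or float)
case_1 = InputExample(texts=["A man is eating food.", "A man is eating a piece of bread."], label=0.8)

# Case 2: a pair of positive (similar) sentences without a label
case_2 = InputExample(texts=["How can I learn Python?", "What is the best way to learn Python?"])

# Case 3: a single sentence with an integer class label
case_3 = InputExample(texts=["This movie was fantastic!"], label=2)

# Case 4: a triplet (anchor, positive, negative) without a label
case_4 = InputExample(texts=["A man is eating food.", "A man eats something.", "A plane is taking off."])
```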
You can get almost ready-to-train datasets from various sources. One of them is the Hugging Face Hub.
## Datasets on the Hugging Face Hub
The [Datasets library](https://huggingface.co/docs/datasets/index) (`pip install datasets`) allows you to load datasets from the Hugging Face Hub with the `load_dataset` function:
```python
from datasets import load_dataset
# Indicate the repo id from the Hub
dataset_id = "embedding-data/QQP_triplets"
dataset = load_dataset(dataset_id)
```
For more information on how to manipulate your dataset, see the [» Datasets Documentation](https://huggingface.co/docs/datasets/access).
The Hub hosts many popular datasets used to train and fine-tune SentenceTransformers models.
This folder contains examples for making SentenceTransformer models **faster, cheaper and lighter**. These lighter models achieve 97.5% - 100% of the performance of the original model on downstream tasks.
Knowledge distillation describes the process of transferring knowledge from a teacher model to a student model. It can be used to extend sentence embeddings to new languages ([Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813)), but the traditional approach is to have a slow (but well-performing) teacher model and a fast student model.
The fast student model imitates the teacher model and thereby achieves high performance.
**[model_distillation.py](model_distillation.py)** implements two options for creating the student model:
1) Use a light transformer model like TinyBERT or BERT-Small to imitate the teacher.
2) Take the teacher model and keep only certain layers, for example, only 4 layers.

Option 2 usually works better, as we keep most of the weights from the teacher. In option 1, all weights in the student have to be tuned from scratch.
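A minimal sketch of option 2, assuming a BERT/RoBERTa-like student whose layer stack is exposed as `auto_model.encoder.layer` (the model name and layer indices are just examples):

```python
import torch
from sentence_transformers import SentenceTransformer

# Start from the teacher and prune it down to a smaller student
student_model = SentenceTransformer("sentence-transformers/stsb-roberta-base")
auto_model = student_model._first_module().auto_model

# Keep only a subset of the transformer layers (here: 4 of the 12 teacher layers)
layers_to_keep = [1, 4, 7, 10]
auto_model.encoder.layer = torch.nn.ModuleList(
    [layer for i, layer in enumerate(auto_model.encoder.layer) if i in layers_to_keep]
)
auto_model.config.num_hidden_layers = len(layers_to_keep)

# The pruned student is then fine-tuned to imitate the teacher's embeddings
```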
## Speed - Performance Trade-Off
Smaller models are faster, but show (slightly) worse performance when evaluated on downstream tasks. To get an impression of this trade-off, we show some numbers for the *stsb-roberta-base* model with different numbers of layers:
| Layers | STSbenchmark Performance | Performance Decrease |Speed (Sent. / Sec. on V100-GPU) |
| ---- |:----:|:----:|:----:|
| teacher: 12 | 85.44 | - | 2300 |
| 8 | 85.54 | +0.1% | 3200 |
| 6 | 85.23 | -0.2% | 4000 |
| 4 | 84.92 | -0.6% | 5300 |
| 3 | 84.39 | -1.2% | 6500 |
| 2 | 83.32 | -2.5% | 7700 |
| 1 | 80.86 | -5.4% | 9200 |
## Dimensionality Reduction
By default, the pretrained models output embeddings of size 768 (base models) or 1024 (large models). However, when you store millions of embeddings, this can require quite a lot of memory / storage.
**[dimensionality_reduction.py](dimensionality_reduction.py)** contains a simple example of how to reduce the embedding dimension to any size using Principal Component Analysis (PCA). In that example, we reduce the 768-dimensional embeddings to 128 dimensions, reducing the storage requirement by a factor of 6. The performance only slightly drops from 85.44 to 84.96 on the STS benchmark dataset.
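A minimal sketch of this approach (the model name and the corpus used to fit the PCA are placeholders):

```python
import torch
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, models

model = SentenceTransformer("sentence-transformers/stsb-roberta-base")

# Fit PCA on embeddings of a representative sample of sentences (replace with your own corpus)
train_sentences = [f"This is example sentence number {i}." for i in range(1000)]
train_embeddings = model.encode(train_sentences, convert_to_numpy=True)
pca = PCA(n_components=128)
pca.fit(train_embeddings)

# Append a Dense layer that projects the 768-dim embeddings down to 128 dims using the PCA components
dense = models.Dense(
    in_features=model.get_sentence_embedding_dimension(),
    out_features=128,
    bias=False,
    activation_function=torch.nn.Identity(),
)
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_, dtype=torch.float32))
model.add_module("dense", dense)

# model.encode(...) now returns 128-dimensional embeddings
```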
This dimensionality reduction technique can easily be applied to existing models. We could even reduce the embedding size to 32, reducing the storage requirement by a factor of 24 (performance decreases to 81.82).
Note: This technique improves neither the runtime nor the memory requirement for running the model. It only reduces the space needed to store embeddings, for example, for [semantic search](../../applications/semantic-search/README.md).
## Quantization
A [quantized model](https://pytorch.org/docs/stable/quantization.html) executes some or all of its operations with integers rather than floating point values. This allows for more compact models and the use of high-performance vectorized operations on many hardware platforms.
For models that run on **CPUs**, this can yield 40% smaller models and faster inference: depending on the CPU, the speedup is between 15% and 400%. Model quantization is (as of now) not supported for GPUs by PyTorch.
For an example, see [model_quantization.py](model_quantization.py)
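A minimal sketch of dynamic quantization with PyTorch (the model name is a placeholder):

```python
import torch
from torch.quantization import quantize_dynamic
from sentence_transformers import SentenceTransformer

# Load the model on the CPU; PyTorch dynamic quantization targets CPU inference
model = SentenceTransformer("sentence-transformers/stsb-roberta-base", device="cpu")

# Replace all Linear layers with int8 dynamically quantized versions
q_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

# Encode as usual; the quantized model is smaller and faster on most CPUs
embeddings = q_model.encode(["A quantized model executes operations with integers."])
```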
Note: Quantized models are only available for CPUs. Use a GPU, if available, for optimal performance.
Dense embedding models typically produce embeddings with a fixed size, such as 768 or 1024. All further computations (clustering, classification, semantic search, retrieval, reranking, etc.) must then be done on these full embeddings. [Matryoshka Representation Learning](https://arxiv.org/abs/2205.13147) revisits this idea, and proposes a solution to train embedding models whose embeddings are still useful after truncation to much smaller sizes. This allows for considerably faster (bulk) processing.
## Use Cases
A particularly interesting use case is to split up processing into two steps: 1) preprocessing with much smaller vectors and then 2) processing the remaining vectors at their full size (also called "shortlisting and reranking"). Additionally, Matryoshka models allow you to scale your embedding solution to your desired storage cost, processing speed and performance.
## Results
Let's look at the actual performance that we may be able to expect from a Matryoshka embedding model versus a regular embedding model. For this experiment, I have trained two models:
* [tomaarsen/mpnet-base-nli-matryoshka](https://huggingface.co/tomaarsen/mpnet-base-nli-matryoshka): Trained by running [matryoshka_nli.py](matryoshka_nli.py) with [microsoft/mpnet-base](https://huggingface.co/microsoft/mpnet-base).
* [tomaarsen/mpnet-base-nli](https://huggingface.co/tomaarsen/mpnet-base-nli): Trained by running a modified version of [matryoshka_nli.py](matryoshka_nli.py) where the training loss is only `MultipleNegativesRankingLoss` rather than `MatryoshkaLoss` on top of `MultipleNegativesRankingLoss`. I also use [microsoft/mpnet-base](https://huggingface.co/microsoft/mpnet-base) as the base model.
Both of these models were trained on the AllNLI dataset, which is a concatenation of the [SNLI](https://huggingface.co/datasets/snli) and [MultiNLI](https://huggingface.co/datasets/multi_nli) datasets. I have evaluated these models on the [STSBenchmark](https://huggingface.co/datasets/mteb/stsbenchmark-sts) test set using multiple different embedding dimensions. The results, obtained by running [matryoshka_eval_stsb.py](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/matryoshka/matryoshka_eval_stsb.py), are plotted in the following figure:
In the top figure, you can see that the Matryoshka model reaches a higher Spearman similarity than the standard model at all dimensionalities, indicating that the Matryoshka model is superior for this task.
Furthermore, the performance of the Matryoshka model falls off much less quickly than the standard model. This is shown clearly in the second figure, which shows the performance at the embedding dimension relative to the maximum performance. **Even at 8.3% of the embedding size, the Matryoshka model preserves 98.37% of the performance**, much higher than the 96.46% by the standard model.
These findings indicate that truncating embeddings from a Matryoshka model can: 1) significantly speed up downstream tasks such as retrieval and 2) significantly reduce storage space, all without a notable hit in performance.
## Training
Training using Matryoshka Representation Learning (MRL) is quite elementary: rather than applying some loss function on only the full-size embeddings, we also apply that same loss function on truncated portions of the embeddings. For example, if a model has an embedding dimension of 768 by default, it can now be trained on 768, 512, 256, 128, 64 and 32. Each of these losses will be added together, optionally with some weight:
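A minimal sketch of setting up such a loss (using `microsoft/mpnet-base` and `MultipleNegativesRankingLoss`, as in the experiments above):

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

model = SentenceTransformer("microsoft/mpnet-base")

# The base loss is evaluated at every Matryoshka dimension and the results are summed
base_loss = MultipleNegativesRankingLoss(model=model)
loss = MatryoshkaLoss(
    model=model,
    loss=base_loss,
    matryoshka_dims=[768, 512, 256, 128, 64, 32],
    # matryoshka_weights=[1, 1, 1, 1, 1, 1],  # optional per-dimension weights
)
```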
Additionally, this can be combined with the `AdaptiveLayerLoss` such that the resulting model can be reduced both in the size of the output dimensions and in the number of layers for faster inference. See the [Adaptive Layers](../adaptive_layer/README.html) documentation for more information on reducing the number of model layers. In Sentence Transformers, the combination of these two losses is called `Matryoshka2dLoss`, and a shorthand is provided for simpler training.
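A minimal sketch of the shorthand (same assumptions as the sketch above):

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import Matryoshka2dLoss, MultipleNegativesRankingLoss

model = SentenceTransformer("microsoft/mpnet-base")

# Wraps the base loss with both MatryoshkaLoss and AdaptiveLayerLoss behaviour
loss = Matryoshka2dLoss(
    model=model,
    loss=MultipleNegativesRankingLoss(model=model),
    matryoshka_dims=[768, 512, 256, 128, 64, 32],
)
```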
After a model has been trained using a Matryoshka loss, you can then run inference with it using <a href="../../../docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode"><code>SentenceTransformer.encode</code></a>.
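A minimal sketch of such inference, manually truncating the embeddings to a small Matryoshka dimension before computing similarities (the model path and the search query are placeholders; the `search_query:` / `search_document:` prefixes are only needed if your model was trained with them):

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Placeholder: the Matryoshka model you trained above
model = SentenceTransformer("path/to/your-matryoshka-model")

matryoshka_dim = 64  # keep only the first 64 dimensions of each embedding

texts = [
    "search_query: Who was Amelia Earhart?",
    "search_document: t-distributed stochastic neighbor embedding (t-SNE) is a statistical method for visualizing high-dimensional data by giving each datapoint a location in a two or three-dimensional map.",
    "search_document: Amelia Mary Earhart was an American aviation pioneer and writer.",
]

embeddings = model.encode(texts)
embeddings = embeddings[:, :matryoshka_dim]  # truncate to the Matryoshka dimension

# Compare the query against both documents
similarities = cos_sim(embeddings[0], embeddings[1:])
print(similarities)
```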
"search_document: t-distributed stochastic neighbor embedding (t-SNE) is a statistical method for visualizing high-dimensional data by giving each datapoint a location in a two or three-dimensional map.",
"search_document: Amelia Mary Earhart was an American aviation pioneer and writer.",
As you can see, the similarity between the search query and the correct document is much higher than that of an unrelated document, despite the very small matryoshka dimension applied. Feel free to copy this script locally, modify the `matryoshka_dim`, and observe the difference in similarities.
**Note**: Despite the embeddings being smaller, training and inference with a Matryoshka model are not faster, not more memory-efficient, and the model itself is not smaller. Only the processing and storage of the resulting embeddings will be faster and cheaper.
## Code Examples
See the following scripts as examples of how to apply the <a href="../../../docs/package_reference/losses.html#matryoshkaloss"><code>MatryoshkaLoss</code></a> in practice:
* **[matryoshka_nli.py](matryoshka_nli.py)**: This example uses the `MultipleNegativesRankingLoss` with `MatryoshkaLoss` to train a strong embedding model using Natural Language Inference (NLI) data. It is an adaptation of the [NLI](../nli/README) documentation.
* **[matryoshka_nli_reduced_dim.py](matryoshka_nli_reduced_dim.py)**: This example uses the `MultipleNegativesRankingLoss` with `MatryoshkaLoss` to train a strong embedding model with a small maximum output dimension of 256. It trains using Natural Language Inference (NLI) data, and is an adaptation of the [NLI](../nli/README) documentation.
* **[matryoshka_eval_stsb.py](matryoshka_eval_stsb.py)**: This example evaluates the embedding model trained with `MatryoshkaLoss` in [matryoshka_nli.py](matryoshka_nli.py) on the test set of the STSBenchmark dataset, and compares it to a non-Matryoshka trained model.
* **[matryoshka_sts.py](matryoshka_sts.py)**: This example uses the `CoSENTLoss` with `MatryoshkaLoss` to train an embedding model on the training set of the STSBenchmark dataset. It is an adaptation of the [STS](../sts/README) documentation.
And see the following scripts for how to apply <a href="../../../docs/package_reference/losses.html#matryoshka2dloss"><code>Matryoshka2dLoss</code></a>:
* **[2d_matryoshka_nli.py](2d_matryoshka_nli.py)**: This example uses the `MultipleNegativesRankingLoss` with `Matryoshka2dLoss` to train a strong embedding model using Natural Language Inference (NLI) data. It is an adaptation of the [NLI](../nli/README) documentation.
* **[2d_matryoshka_sts.py](2d_matryoshka_sts.py)**: This example uses the `CoSENTLoss` with `Matryoshka2dLoss` to train an embedding model on the training set of the STSBenchmark dataset. It is an adaptation of the [STS](../sts/README) documentation.