**This page is currently work-in-progress and will be extended in the future**
In our paper [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813) we showed that paraphrase datasets together with [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss) are a powerful combination for learning sentence embedding models.
See [NLI - MultipleNegativesRankingLoss](https://www.sbert.net/examples/training/nli/README.html#multiplenegativesrankingloss) for more information on how the loss can be used.
In this folder, we collect different datasets and scripts to train using paraphrase data.
## Datasets
You can find a list of datasets with paraphrases suitable for training at [sbert.net/datasets/paraphrases](http://sbert.net/datasets/paraphrases).
See the respective linked source website for the dataset license.
All datasets have one sample per line, with the individual sentences separated by a tab (\t). Some datasets (like AllNLI) have three sentences per line: an anchor, a positive, and a hard negative.
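As a hedged illustration (the file name below is a placeholder), such a file could be loaded into training examples like this:

```python
from sentence_transformers import InputExample

train_examples = []
with open("paraphrase-data.tsv", encoding="utf8") as fIn:  # placeholder file name
    for line in fIn:
        # Two columns for pairs; three columns (anchor, positive, hard negative) for datasets like AllNLI
        texts = line.rstrip("\n").split("\t")
        train_examples.append(InputExample(texts=texts))
```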
For each dataset, we measure the performance on the STSb development set after 2k training steps with a distilroberta-base model and a batch size of 256.
**Note**: We find that the STSb dataset is a suboptimal dataset for evaluating the quality of sentence embedding models. It consists mainly of rather simple sentences, does not require any domain-specific knowledge, and the included sentences are of rather high quality compared to noisy, user-written content. Please do not infer from the above numbers how the approaches will perform on your domain-specific dataset.
## Training
See [training.py](training.py) for the training script.
The training script can load one or multiple files. We construct batches by sampling examples from the respective dataset. So far, examples are not mixed between datasets, i.e., a batch consists only of examples from a single dataset.
As the datasets differ considerably in size, we perform temperature-controlled sampling: smaller datasets are up-sampled, while larger datasets are down-sampled. This allows effective training with both very large and smaller datasets.
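The exact scheme is implemented in [training.py](training.py); as a rough, illustrative sketch (the temperature value is an assumption for illustration, not the script's actual setting), temperature-controlled sampling could look like this:

```python
import numpy as np

dataset_sizes = [10_000, 100_000, 1_000_000]  # sizes of the individual datasets
temperature = 0.5  # 1.0 = sample proportional to size, 0.0 = sample uniformly

probs = np.array(dataset_sizes, dtype=float) ** temperature
probs /= probs.sum()  # smaller datasets get up-weighted relative to their size

# Pick the dataset from which the next batch is drawn
dataset_idx = np.random.choice(len(dataset_sizes), p=probs)
```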
## Pre-Trained Models
Have a look at [pre-trained models](https://www.sbert.net/docs/pretrained_models.html) to view all models that were trained on these paraphrase datasets.
- **paraphrase-MiniLM-L12-v2** - Trained on the following datasets: AllNLI, sentence-compression, SimpleWiki, altlex, msmarco-triplets, quora_duplicates, coco_captions, flickr30k_captions, yahoo_answers_title_question, S2ORC_citation_pairs, stackexchange_duplicate_questions, wiki-atomic-edits
- **paraphrase-distilroberta-base-v2** - Trained on the following datasets: AllNLI, sentence-compression, SimpleWiki, altlex, msmarco-triplets, quora_duplicates, coco_captions, flickr30k_captions, yahoo_answers_title_question, S2ORC_citation_pairs, stackexchange_duplicate_questions, wiki-atomic-edits
- **paraphrase-distilroberta-base-v1** - Trained on the following datasets: AllNLI, sentence-compression, SimpleWiki, altlex, quora_duplicates, wiki-atomic-edits, wiki-split
- **paraphrase-xlm-r-multilingual-v1** - Multilingual version of paraphrase-distilroberta-base-v1, trained on parallel data for 50+ languages. (Teacher: paraphrase-distilroberta-base-v1, Student: xlm-r-base)
## Work in Progress
Training with this data is currently work-in-progress. Things that will be added in the near future:
- **More datasets**: Are you aware of more suitable training datasets? Let me know: [info@nils-reimers.de](mailto:info@nils-reimers.de)
- **Optimized batching**: Currently, batches are only drawn from one dataset. Future work might also include batches that are sampled across datasets.
- **Optimized loss function**: Currently, the same parameters of MultipleNegativesRankingLoss are used for all datasets. Future work includes testing whether the datasets benefit from individual loss functions.
- **Pre-trained models**: Once all datasets are collected, we will train and release respective models.
This folder contains scripts that demonstrate how to train SentenceTransformers for **Information Retrieval**. As a simple example, we will use the [Quora Duplicate Questions dataset](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs). It contains over 500,000 sentences with over 400,000 pairwise annotations indicating whether two questions are duplicates or not.
## Pretrained Models
Currently the following models trained on Quora Duplicate Questions are available:
- **distilbert-base-nli-stsb-quora-ranking**: We extended the *distilbert-base-nli-stsb-mean-tokens* model and trained it with *OnlineContrastiveLoss* and with *MultipleNegativesRankingLoss* on the Quora Duplicate Questions dataset. For the code, see [training_multi-task-learning.py](training_multi-task-learning.py)
- **distilbert-multilingual-nli-stsb-quora-ranking**: Extension of *distilbert-base-nli-stsb-quora-ranking* to be multilingual. Trained on parallel data for 50 languages.
As the dataset for training a **Duplicate Questions Semantic Search Engine**, we use the [Quora Duplicate Questions dataset](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs). The original format looks like this:
```
id qid1 qid2 question1 question2 is_duplicate
0 1 2 What is the step by step guide to invest in share market in india? What is the step by step guide to invest in share market? 0
1 3 4 What is the story of Kohinoor (Koh-i-Noor) Diamond? What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back? 0
```
As a first step, we process this file to create distinct train/dev/test splits for different tasks. We define the following tasks:
- **Duplicate Questions Classification**: Given two questions, are these questions duplicates? This is the original task as defined by Quora; however, it is a rather impractical task. How do we retrieve possible duplicates in a large corpus for a given question? Further, models performing well on this classification task do not necessarily perform well on the following two tasks.
- **Duplicate Questions Mining**: Given a large set (like 100k) of questions, identify all question pairs that are duplicates.
- **Duplicate Questions Information Retrieval**: Given a large corpus (350k+) of questions and a new, unseen question, find the most related (i.e., duplicate) questions in this corpus.
**Download**: You can download the finished dataset here: [quora-IR-dataset.zip](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip)
For details on the creation of the dataset, see [create_splits.py](create_splits.py).
## Usage
### Duplicate Questions Mining
Given a large set of sentences (in this case questions), identify all pairs that are duplicates. See [Paraphrase Mining](../../applications/paraphrase-mining/README.md) for an example of how to use sentence-transformers to mine for duplicate questions / paraphrases. This approach can be scaled to hundreds of thousands of sentences, given you have enough memory.
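A minimal sketch using the `paraphrase_mining` utility (the model name here is just an example):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("distilbert-base-nli-stsb-quora-ranking")
questions = [
    "How do I learn Python?",
    "How can I learn Python?",
    "What is the capital of France?",
]

# Returns [score, i, j] triplets, sorted by decreasing cosine similarity
pairs = util.paraphrase_mining(model, questions)
for score, i, j in pairs:
    print(f"{score:.3f}\t{questions[i]}\t{questions[j]}")
```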
### Semantic Search
The model can also be used for Information Retrieval / Semantic Search. Given a new question, search a large corpus of hundreds of thousands of questions for duplicates. Given enough memory, this approach works well for corpora of up to millions of questions (depending on your real-time requirements).
For an interactive example, see [Semantic Search](../../applications/semantic-search/README.md).
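A minimal sketch using `util.semantic_search` (model name and corpus are illustrative):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("distilbert-base-nli-stsb-quora-ranking")

corpus = ["How do I learn Python?", "What is the capital of France?"]  # in practice: hundreds of thousands of questions
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

query_embedding = model.encode("How can I learn Python?", convert_to_tensor=True)
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)[0]
for hit in hits:
    print(f"{hit['score']:.3f}\t{corpus[hit['corpus_id']]}")
```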
## Training
Choosing the right loss function is crucial for obtaining well-working sentence embeddings. For the given task, two loss functions are especially suitable: **ContrastiveLoss** and **MultipleNegativesRankingLoss**.
### Contrastive Loss
For the complete example, see [training_OnlineContrastiveLoss.py](training_OnlineContrastiveLoss.py).
In the original dataset, questions are given with a label of 0 = not duplicate and 1 = duplicate. In that case, we can use contrastive loss: similar pairs with label 1 are pulled together, so that they are close in vector space. Dissimilar pairs that are closer than a defined margin are pushed apart in vector space.
Choosing the distance function and especially choosing a sensible margin are quite important for the success of contrastive loss. In the given example, we use cosine_distance (which is 1 - cosine_similarity) with a margin of 0.5. I.e., non-duplicate questions should have a cosine_distance of at least 0.5 (which is equivalent to a cosine similarity difference of 0.5).
An improved version of contrastive loss is OnlineContrastiveLoss: it checks which negative pairs have a lower distance than the largest positive pair, and which positive pairs have a higher distance than the lowest distance of negative pairs. I.e., this loss automatically detects the hard cases in a batch and computes the loss only for these cases.
For each row in our train dataset, we create an InputExample object with the two questions as texts and is_duplicate as the label.
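A hedged sketch of this step, assuming the `classification/train_pairs.tsv` file from the quora-IR-dataset with columns `question1`, `question2`, and `is_duplicate`:

```python
import csv
from sentence_transformers import InputExample

train_samples = []
with open("quora-IR-dataset/classification/train_pairs.tsv", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        train_samples.append(
            InputExample(texts=[row["question1"], row["question2"]], label=int(row["is_duplicate"]))
        )
```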
### MultipleNegativesRankingLoss
For the complete example, see [training_MultipleNegativesRankingLoss.py](training_MultipleNegativesRankingLoss.py).
*MultipleNegativesRankingLoss* is especially suitable for Information Retrieval / Semantic Search. A nice advantage of *MultipleNegativesRankingLoss* is that it only requires positive pairs, i.e., we only need examples of duplicate questions.
From all pairs, we sample a mini-batch *(a_1, b_1), ..., (a_n, b_n)* where each *(a_i, b_i)* is a pair of duplicate questions.
MultipleNegativesRankingLoss now uses all *b_j* with j != i as negative examples for *(a_i, b_i)*. For example, for *a_1* we are given the options *(b_1, ..., b_n)* and we need to identify which is the correct duplicate question for *a_1*. We do this by computing the dot-product between the embedding of *a_1* and all *b*'s and softmax-normalizing it, so that we get a probability distribution over *(b_1, ..., b_n)*. In the best case, the positive example *b_1* gets a probability close to 1 while all others get scores close to 0. We use the negative log-likelihood to compute the loss.
*MultipleNegativesRankingLoss* implements this idea in an efficient way so that the embeddings are re-used. With a batch size of 64, we have 64 positive pairs and each positive pair has 64 - 1 = 63 negative distractors.
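Conceptually (an illustrative sketch with random embeddings, not the library's actual implementation), the per-batch computation looks like this:

```python
import torch

n, dim = 8, 4  # batch size and embedding dimension, chosen arbitrarily
emb_a, emb_b = torch.randn(n, dim), torch.randn(n, dim)  # embeddings of a_1..a_n and b_1..b_n

scores = emb_a @ emb_b.T  # (n, n) matrix; scores[i][j] = dot-product between a_i and b_j
labels = torch.arange(n)  # the correct b for each a_i sits on the diagonal
loss = torch.nn.functional.cross_entropy(scores, labels)  # softmax + negative log-likelihood
```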
Using the loss is easy and does not require tuning of any hyperparameters:
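(A minimal sketch; the base model, the toy training pairs, and the hyperparameters here are illustrative, not the exact settings of the training script.)

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")

# Positive pairs only; (B, A) is added as well, since being a duplicate is symmetric
train_samples = [
    InputExample(texts=["How do I learn Python?", "How can I learn Python?"]),
    InputExample(texts=["How can I learn Python?", "How do I learn Python?"]),
]

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=64)
train_loss = losses.MultipleNegativesRankingLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=1000)
```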
We only use the positive examples. As 'is_duplicate' is a symmetric relation, we not only add (A, B) but also (B, A) to our training sample set.
**Note 1:** Increasing the batch size usually yields better results, as the task gets harder: it is more difficult to identify the correct duplicate question out of a set of 100 questions than out of a set of only 10 questions. So it is advisable to set the training batch size as large as possible. I trained it with a batch size of 350 on 32 GB GPU memory.
**Note 2:** MultipleNegativesRankingLoss only works if *(a_i, b_j)* with j != i is actually a negative, non-duplicate question pair. In a few instances, this assumption is wrong; but in the majority of cases, if we sample two random questions, they are not duplicates. If your dataset cannot fulfil this property, MultipleNegativesRankingLoss might not work well.
### Multi-Task-Learning
Contrastive loss works well for pair classification, i.e., given two questions, deciding whether they are duplicates or not. It pushes negative pairs far apart in vector space, so that distinguishing between duplicate and non-duplicate pairs works well.
MultipleNegativesRankingLoss, on the other hand, mainly reduces the distance between positive pairs out of a large set of possible candidates. However, the distance between non-duplicate questions is not increased as much, so this loss does not work as well for pair classification.
In [training_multi-task-learning.py](training_multi-task-learning.py) I demonstrate how we can train the network with both losses. The essential code is to define both losses and to pass them to the fit method.
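A hedged sketch of the pattern (data and hyperparameters are illustrative placeholders):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")

# Illustrative data: MNRL gets positive pairs, contrastive loss gets labeled pairs
samples_mnrl = [InputExample(texts=["How do I learn Python?", "How can I learn Python?"])]
samples_pairs = [InputExample(texts=["How do I learn Python?", "What is Java?"], label=0)]

loader_mnrl = DataLoader(samples_mnrl, shuffle=True, batch_size=64)
loader_pairs = DataLoader(samples_pairs, shuffle=True, batch_size=64)

loss_mnrl = losses.MultipleNegativesRankingLoss(model)
loss_contrastive = losses.OnlineContrastiveLoss(model=model, margin=0.5)

# fit iterates round-robin over the objectives: one batch per objective per step
model.fit(
    train_objectives=[(loader_mnrl, loss_mnrl), (loader_pairs, loss_contrastive)],
    epochs=1,
    warmup_steps=1000,
)
```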
1) After reading the quora_duplicate_questions.tsv, as provided by Quora, we add the transitive closure: if questions (A, B) are duplicates and (B, C) are duplicates, then (A, C) must also be duplicates. We add these missing links.
2) Next, we split the sentences into train, dev, and test with a ratio of about 85% / 5% / 10%. In contrast to most other Quora data splits, like the split provided by GLUE, we ensure that the three sets are overlap-free, i.e., no sentence in dev / test appears in the train dataset. To achieve three distinct datasets, we pick a sentence and then assign it, together with all its duplicate sentences, to the same set.
3) After distributing the sentences to the three dataset splits, we create files to facilitate 3 different tasks:
3.1) Classification - Given two sentences, are they duplicates? This is identical to the original Quora task and the task in GLUE, but with the big difference that sentences in dev / test have not been seen in train.
3.2) Duplicate Question Mining - Given a large set of questions, identify all duplicates. The dev set consists of about 50k questions, the test set of about 100k questions.
3.3) Information Retrieval - Given a question as query, find the duplicates of the query question in a large corpus (~350k questions).
The output consists of the following files:

```
quora_duplicate_questions.tsv - Original file provided by Quora (https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs)

classification/
  train/dev/test_pairs.tsv - Distinct sets of question pairs with a duplicate / non-duplicate label. These splits can be used for sentence pair classification tasks.

duplicate-mining/ - Given a large set of questions, find all duplicates.
  _corpus.tsv - Large set of sentences
  _duplicates.tsv - All duplicate questions in the respective corpus.tsv

information-retrieval/ - Given a large corpus of questions, find the duplicates for a given query
  corpus.tsv - Used for train/dev/test; contains all questions in the corpus
  dev/test-queries.tsv - Queries and the respective duplicate questions (QIDs) in the corpus
```
This script demonstrates how to train a sentence embedding model for Information Retrieval.
As dataset, we use Quora Duplicate Questions, where we have pairs of duplicate questions.
As loss function, we use MultipleNegativesRankingLoss. Here, we only need positive pairs, i.e., pairs of sentences/texts that are considered to be relevant. Our dataset looks like this: (a_1, b_1), (a_2, b_2), ... where a_i and b_i are texts and (a_i, b_i) is a relevant pair (e.g. duplicates).
MultipleNegativesRankingLoss takes a random subset of these, for example (a_1, b_1), ..., (a_n, b_n). a_i and b_i are considered relevant and should be close in vector space. All other b_j (for i != j) are negative examples and the distance between a_i and b_j should be maximized. Note: MultipleNegativesRankingLoss only works if a random b_j is likely not to be relevant for a_i. This is the case for our duplicate questions dataset: if we sample a random b_j, it is unlikely to be a duplicate of a_i.
The model we get works well for duplicate questions mining and for duplicate questions information retrieval. For question pair classification, other losses (like OnlineContrastiveLoss) work better.
This script demonstrates how to train a sentence embedding model for question pair classification with cosine-similarity and a simple threshold.
As dataset, we use Quora Duplicate Questions, where we have labeled pairs of questions being either duplicates (label 1) or non-duplicates (label 0).
As loss function, we use OnlineContrastiveLoss. It reduces the distance between positive pairs, i.e., it pulls the embeddings of positive pairs closer together. For negative pairs, it pushes them further apart.
An issue with contrastive loss is that it might push apart sentences that are already well positioned in vector space.
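A minimal sketch of setting up this loss (the base model is an example; margin and distance metric follow the settings described above):

```python
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE  # 1 - cosine_similarity
train_loss = losses.OnlineContrastiveLoss(model=model, distance_metric=distance_metric, margin=0.5)
```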
Semantic Textual Similarity (STS) assigns a score to the similarity of two texts. In this example, we use the [STSbenchmark](https://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) as training data to fine-tune our network. See the following example scripts for how to tune SentenceTransformer on STS data:
- **[training_stsbenchmark.py](training_stsbenchmark.py)** - This example shows how to create a SentenceTransformer model from scratch by using a pre-trained transformer model together with a pooling layer.
- **[training_stsbenchmark_continue_training.py](training_stsbenchmark_continue_training.py)** - This example shows how to continue training on STS data for a previously created & trained SentenceTransformer model. In that example, we load a model trained on [NLI data](../nli/README.md).
## Training data
In STS, we have sentence pairs annotated with a score indicating their similarity. For the [STSbenchmark](https://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark), the scores range from 0 (the content of the two sentences is completely different) up to 5 (the two sentences are identical in meaning). To train our network, we need to normalize these scores to the range 0 to 1. This can simply be done by dividing the score by 5.
To store our training data, we create a list of `InputExample` objects. Each `InputExample` contains the sentence pair together with the label (score) that ranges between 0 and 1. A simplified version of how the training data has to look is the following:
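(The sentences and scores below are made-up illustrations.)

```python
from sentence_transformers import InputExample

train_examples = [
    InputExample(texts=["My first sentence", "My second sentence"], label=0.8),
    InputExample(texts=["Another pair", "Unrelated sentence"], label=0.3),
]
```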
As loss function we use [CosineSimilarityLoss](../../../docs/package_reference/losses.html#cosinesimilarityloss).
*CosineSimilarityLoss* trains the network with a siamese network structure (for details see: [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084))
For each sentence pair, we pass sentence A and sentence B through our network, which yields the embeddings *u* and *v*. The similarity of these embeddings is computed using cosine similarity and the result is compared to the gold similarity score. This allows our network to be fine-tuned to recognize the similarity of sentences.
This training in a siamese network structure is done automatically when we use CosineSimilarityLoss.
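Putting it together, a minimal sketch along the lines of [training_stsbenchmark.py](training_stsbenchmark.py) (the base model and hyperparameters are examples; `train_examples` is the list from the snippet above):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses, models

# Transformer + pooling layer, as described for creating a model from scratch
word_embedding_model = models.Transformer("distilroberta-base", max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=4, warmup_steps=100)
```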
This example trains BERT (or any other transformer model like RoBERTa, DistilBERT, etc.) on the STSbenchmark from scratch. It generates sentence embeddings that can be compared using cosine-similarity to measure similarity.
Carlsson et al. present in [Semantic Re-Tuning With Contrastive Tension (CT)](https://openreview.net/pdf?id=Ov_sMNau-PF) ([GitHub](https://github.com/FreddeFrallan/Contrastive-Tension)) an unsupervised learning approach for sentence embeddings that just requires sentences.
## Background
During training, CT builds two independent encoders ('Model1' and 'Model2') with initial parameters shared to encode a pair of sentences. If Model1 and Model2 encode the same sentence, then the dot-product of the two sentence embeddings should be large. If Model1 and Model2 encode different sentences, then their dot-product should be small.
The original CT paper uses batches that contain multiple mini-batches. For the example of K=7, each mini-batch consists of the sentence pairs (S_A, S_A), (S_A, S_B), (S_A, S_C), ..., (S_A, S_H) and the corresponding labels are 1, 0, 0, ..., 0. In other words, one identical pair of sentences is viewed as the positive example and the other pairs of different sentences are viewed as negative examples (i.e. 1 positive + K negative pairs). The training objective is the binary cross-entropy between the generated similarity scores and the labels. This is illustrated in the figure (from Appendix A.1 of the CT paper) below:
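A hedged sketch of how such a mini-batch could be assembled for K=7 (illustrative code, not the repository's implementation):

```python
K = 7
s_a = "The anchor sentence."
others = [f"A different sentence {i}." for i in range(K)]  # placeholder sentences

# 1 identical (positive) pair + K pairs of different sentences (negatives)
pairs = [(s_a, s_a)] + [(s_a, s_other) for s_other in others]
labels = [1] + [0] * K
```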
After training, Model2 is used for inference, as it usually has better performance.
In **[CT_Improved](../CT_In-Batch_Negatives/README.md)** we propose an improvement to CT by using in-batch negative sampling.
## Performance
In some preliminary experiments, we compare the performance on the STSbenchmark dataset (trained with 1 million sentences from Wikipedia) and on the paraphrase mining task for the Quora duplicate questions dataset (trained with questions from Quora).
Note: We used the code provided in this repository, not the official code from the authors.
## CT from Sentences File
**[train_ct_from_file.py](train_ct_from_file.py)** loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
CT will be trained using these sentences. Checkpoints are stored every 500 steps in the output folder.
## Further Training Examples
- **[train_stsb_ct.py](train_stsb_ct.py)**: This example uses 1 million sentences from Wikipedia to train with CT. It evaluates the performance on the [STSbenchmark dataset](https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark).
- **[train_askubuntu_ct.py](train_askubuntu_ct.py)**: This example trains on the [AskUbuntu Questions dataset](https://github.com/taolei87/askubuntu), a dataset with questions from the AskUbuntu Stackexchange forum.
**Note:**
This is a re-implementation of CT within sentence-transformers. For the official CT code, see: [FreddeFrallan/Contrastive-Tension](https://github.com/FreddeFrallan/Contrastive-Tension)