"""
This file loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
CT will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
Usage:
python train_ct_from_file.py path/to/sentences.txt
"""
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer
import logging
from datetime import datetime
import gzip
import sys
import tqdm
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
## Training parameters
model_name = "distilbert-base-uncased"
batch_size = 16
pos_neg_ratio = 8 # batch_size must be divisible by pos_neg_ratio
num_epochs = 1
max_seq_length = 75
# Input file path (a text file, each line a sentence)
if len(sys.argv) < 2:
print("Run this script with: python {} path/to/sentences.txt".format(sys.argv[0]))
exit()
filepath = sys.argv[1]
# Save path to store our model
output_name = ""
if len(sys.argv) >= 3:
output_name = "-" + sys.argv[2].replace(" ", "_").replace("/", "_").replace("\\", "_")
model_output_path = "output/train_ct{}-{}".format(output_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Read the train corpus #################
train_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
logging.info("Train sentences: {}".format(len(train_sentences)))
# For ContrastiveTension we need a special data loader to construct batches with the desired properties
train_dataloader = losses.ContrastiveTensionDataLoader(
train_sentences, batch_size=batch_size, pos_neg_ratio=pos_neg_ratio
)
# As loss, we use losses.ContrastiveTensionLoss
train_loss = losses.ContrastiveTensionLoss(model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=warmup_steps,
optimizer_params={"lr": 5e-5},
checkpoint_path=model_output_path,
show_progress_bar=True,
use_amp=False, # Set to True, if your GPU supports FP16 cores
)
import torch
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformer, LoggingHandler, models, util, InputExample
from sentence_transformers import losses
import os
import gzip
import csv
from datetime import datetime
import logging
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
## Training parameters
model_name = "distilbert-base-uncased"
batch_size = 16
pos_neg_ratio = 8 # batch_size must be divisible by pos_neg_ratio
epochs = 1
max_seq_length = 75
# Save path to store our model
model_save_path = "output/train_stsb_ct-{}-{}".format(model_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
################# Train sentences #################
# We use 1 Million sentences from Wikipedia to train our model
wikipedia_dataset_path = "data/wiki1m_for_simcse.txt"
if not os.path.exists(wikipedia_dataset_path):
util.http_get(
"https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt",
wikipedia_dataset_path,
)
# train_sentences are simply your list of sentences
train_sentences = []
with open(wikipedia_dataset_path, "r", encoding="utf8") as fIn:
for line in fIn:
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
################# Download and load STSb #################
data_folder = "data/stsbenchmark"
sts_dataset_path = f"{data_folder}/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
################# Initialize an SBERT model #################
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# For ContrastiveTension we need a special data loader to construct batches with the desired properties
train_dataloader = losses.ContrastiveTensionDataLoader(
train_sentences, batch_size=batch_size, pos_neg_ratio=pos_neg_ratio
)
# As loss, we use losses.ContrastiveTensionLoss
train_loss = losses.ContrastiveTensionLoss(model)
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=1,
evaluation_steps=1000,
weight_decay=0,
warmup_steps=0,
optimizer_class=torch.optim.RMSprop,
optimizer_params={"lr": 1e-5},
output_path=model_save_path,
use_amp=False, # Set to True, if your GPU has optimized FP16 cores
)
########### Load the model and evaluate on test set
model = SentenceTransformer(model_save_path)
test_evaluator(model)
# CT (In-Batch Negatives)
Carlsson et al. present in [Semantic Re-Tuning With Contrastive Tension (CT)](https://openreview.net/pdf?id=Ov_sMNau-PF) an unsupervised learning approach for sentence embeddings that just requires sentences.
## Background
During training, CT builds two independent encoders ('Model1' and 'Model2') that share their initial parameters to encode a pair of sentences. If Model1 and Model2 encode the same sentence, then the dot-product of the two sentence embeddings should be large. If Model1 and Model2 encode different sentences, then their dot-product should be small.
In the original CT paper, specially created batches are used. We implemented an improved version that uses in-batch negative sampling: Model1 and Model2 both encode the same set of sentences. We maximize the scores for matching indexes (i.e. Model1(S_i) and Model2(S_i)) while we minimize the scores for different indexes (i.e. Model1(S_i) and Model2(S_j) for i != j).
Using in-batch negative sampling gives a stronger training signal than the original loss function proposed by Carlsson et al.
![CT working](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/CT.jpg)
After training, Model2 is used for inference, as it usually achieves better performance.
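Training with the in-batch negatives variant only requires a list of sentences. Below is a minimal sketch based on the training scripts referenced further down (the model name, batch size, and sentence list are placeholders):

```python
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers import models, losses
from torch.utils.data import DataLoader

# Define a SentenceTransformer model with mean pooling
model_name = "distilbert-base-uncased"
word_embedding_model = models.Transformer(model_name, max_seq_length=75)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Define a list with sentences (1k - 100k sentences)
train_sentences = [
    "Your set of sentences",
    "In practice you should provide at least 1k sentences",
]

# Pair each sentence with itself; the other sentences in the batch act as negatives
train_data = [InputExample(texts=[s, s]) for s in train_sentences]
train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)

# In-batch negatives variant of the Contrastive Tension loss
train_loss = losses.ContrastiveTensionLossInBatchNegatives(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    show_progress_bar=True,
)
model.save("output/ct-improved-model")
```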
## Performance
In some preliminary experiments, we compare performance on the STSbenchmark dataset (trained with 1 million sentences from Wikipedia) and on the Quora duplicate questions dataset (trained with questions from Quora).
| Method | STSb (Spearman) | Quora-Duplicate-Question (Avg. Precision) |
| --- | :---: | :---: |
| CT | 75.7 | 36.5 |
| CT (In-Batch Negatives) | 78.5 | 40.1 |
Note: We used the code provided in this repository, not the official code from the authors.
## CT from Sentences File
**[train_ct-improved_from_file.py](train_ct-improved_from_file.py)** loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
CT with in-batch negatives will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
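Assuming your sentences are stored one per line, the script is run as:

```bash
python train_ct-improved_from_file.py path/to/sentences.txt
```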
## Further Training Examples
- **[train_stsb_ct-improved.py](train_stsb_ct-improved.py)**: This example uses 1 million sentences from Wikipedia to train with CT. It evaluates the performance on the [STSbenchmark dataset](https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark).
- **[train_askubuntu_ct-improved.py](train_askubuntu_ct-improved.py)**: This example trains on [AskUbuntu Questions dataset](https://github.com/taolei87/askubuntu), a dataset with questions from the AskUbuntu Stackexchange forum.
from sentence_transformers import SentenceTransformer, LoggingHandler, InputExample
from sentence_transformers import models, util, evaluation, losses
import logging
import os
import gzip
from datetime import datetime
from torch.utils.data import DataLoader
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Some training parameters. We use a batch size of 128 with in-batch negative sampling.
# Sentences are truncated to 75 word pieces
## Training parameters
model_name = "distilbert-base-uncased"
batch_size = 128
epochs = 1
max_seq_length = 75
################# Download AskUbuntu and extract training corpus #################
askubuntu_folder = "askubuntu"
output_path = "output/train_askubuntu_ct-improved-{}-{}-{}".format(
model_name, batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
## Download the AskUbuntu dataset from https://github.com/taolei87/askubuntu
for filename in ["text_tokenized.txt.gz", "dev.txt", "test.txt", "train_random.txt"]:
filepath = os.path.join(askubuntu_folder, filename)
if not os.path.exists(filepath):
util.http_get("https://github.com/taolei87/askubuntu/raw/master/" + filename, filepath)
# Read the corpus
corpus = {}
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, "text_tokenized.txt.gz"), "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
id = splits[0]
title = splits[1]
corpus[id] = title
# Read dev & test dataset
def read_eval_dataset(filepath):
dataset = []
with open(filepath) as fIn:
for line in fIn:
query_id, relevant_id, candidate_ids, bm25_scores = line.strip().split("\t")
if len(relevant_id) == 0: # Skip examples without relevant entries
continue
relevant_id = relevant_id.split(" ")
candidate_ids = candidate_ids.split(" ")
negative_ids = set(candidate_ids) - set(relevant_id)
dataset.append(
{
"query": corpus[query_id],
"positive": [corpus[pid] for pid in relevant_id],
"negative": [corpus[pid] for pid in negative_ids],
}
)
dev_test_ids.add(query_id)
dev_test_ids.update(candidate_ids)
return dataset
dev_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "dev.txt"))
test_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "test.txt"))
## Now we need a list of train sentences.
## In this example we simply use all sentences that don't appear in the dev/test set
train_sentences = []
for id, sentence in corpus.items():
if id not in dev_test_ids:
train_sentences.append(InputExample(texts=[sentence, sentence]))
logging.info("{} train sentences".format(len(train_sentences)))
################# Initialize an SBERT model #################
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Train the model #################
# For the in-batch negatives variant, a regular torch DataLoader with shuffling is sufficient
train_dataloader = DataLoader(train_sentences, batch_size=batch_size, shuffle=True, drop_last=True)
# As loss, we use losses.ContrastiveTensionLossInBatchNegatives
train_loss = losses.ContrastiveTensionLossInBatchNegatives(model)
# Create a dev evaluator
dev_evaluator = evaluation.RerankingEvaluator(dev_dataset, name="AskUbuntu dev")
test_evaluator = evaluation.RerankingEvaluator(test_dataset, name="AskUbuntu test")
logging.info("Start training")
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=1,
warmup_steps=100,
use_amp=True, # Set to True, if your GPU has optimized FP16 cores
)
latest_output_path = output_path + "-latest"
model.save(latest_output_path)
### Run test evaluation on the latest model. This is equivalent to not having a dev dataset
model = SentenceTransformer(latest_output_path)
test_evaluator(model)
"""
This file loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
CT (In-Batch Negatives) will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
Usage:
python train_ct-improved_from_file.py path/to/sentences.txt
"""
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer
import logging
from datetime import datetime
import gzip
import sys
import tqdm
from torch.utils.data import DataLoader
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
## Training parameters
model_name = "distilbert-base-uncased"
batch_size = 128
num_epochs = 1
max_seq_length = 75
# Input file path (a text file, each line a sentence)
if len(sys.argv) < 2:
print("Run this script with: python {} path/to/sentences.txt".format(sys.argv[0]))
exit()
filepath = sys.argv[1]
# Save path to store our model
output_name = ""
if len(sys.argv) >= 3:
output_name = "-" + sys.argv[2].replace(" ", "_").replace("/", "_").replace("\\", "_")
model_output_path = "output/train_ct-improved{}-{}".format(output_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Read the train corpus #################
train_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
logging.info("Train sentences: {}".format(len(train_sentences)))
# A regular torch DataLoader and as loss we use losses.ContrastiveTensionLossInBatchNegatives
train_dataloader = DataLoader(train_sentences, batch_size=batch_size, shuffle=True, drop_last=True)
train_loss = losses.ContrastiveTensionLossInBatchNegatives(model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=warmup_steps,
optimizer_params={"lr": 5e-5},
checkpoint_path=model_output_path,
show_progress_bar=True,
use_amp=False, # Set to True, if your GPU supports FP16 cores
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformer, LoggingHandler, models, util, InputExample
from sentence_transformers import losses
import os
import gzip
import csv
from datetime import datetime
import logging
from torch.utils.data import DataLoader
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
## Training parameters
model_name = "distilbert-base-uncased"
batch_size = 128
epochs = 1
max_seq_length = 75
# Save path to store our model
model_save_path = "output/training_stsb_ct-improved-{}-{}".format(
model_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
################# Train sentences #################
# We use 1 Million sentences from Wikipedia to train our model
wikipedia_dataset_path = "data/wiki1m_for_simcse.txt"
if not os.path.exists(wikipedia_dataset_path):
util.http_get(
"https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt",
wikipedia_dataset_path,
)
# train_sentences holds InputExample objects where the same sentence is passed twice, i.e. texts=[sent, sent]
train_sentences = []
with open(wikipedia_dataset_path, "r", encoding="utf8") as fIn:
for line in fIn:
train_sentences.append(InputExample(texts=[line.strip(), line.strip()]))
################# Download and load STSb #################
data_folder = "data/stsbenchmark"
sts_dataset_path = f"{data_folder}/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
################# Initialize an SBERT model #################
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# For the in-batch negatives variant, a regular torch DataLoader with shuffling is sufficient
train_dataloader = DataLoader(train_sentences, batch_size=batch_size, shuffle=True, drop_last=True)
# As loss, we use losses.ContrastiveTensionLossInBatchNegatives with dot-product scoring
train_loss = losses.ContrastiveTensionLossInBatchNegatives(model, scale=1, similarity_fct=util.dot_score)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=1,
evaluation_steps=1000,
warmup_steps=1000,
output_path=model_save_path,
optimizer_params={"lr": 5e-5},
use_amp=True, # Set to True, if your GPU supports FP16 cores
)
########### Load the model and evaluate on test set
model = SentenceTransformer(model_save_path)
test_evaluator(model)
# MLM
Masked Language Model (MLM) is the process by which BERT was pre-trained. It has been shown that continuing MLM on your own data can improve performance (see [Don't Stop Pretraining: Adapt Language Models to Domains and Tasks](https://arxiv.org/abs/2004.10964)). In our [TSDAE-paper](https://arxiv.org/abs/2104.06979) we also show that MLM is a powerful pre-training strategy for learning sentence embeddings. This is especially the case when you work on a specialized domain.
**Note:** Only running MLM will not yield good sentence embeddings. But you can first tune your favorite transformer model with MLM on your domain-specific data. Then you can fine-tune the model with the labeled data you have, or use other datasets like [NLI](../../training/nli/README.md), [Paraphrases](../../training/paraphrases/README.md), or [STS](../../training/sts/README.md).
![MLM working](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/MLM.png)
## Running MLM
The **[train_mlm.py](train_mlm.py)** script provides an easy option to run MLM on your data. You can run it with:
```bash
python train_mlm.py distilbert-base-uncased path/train.txt
```
You can also provide an optional dev dataset:
```bash
python train_mlm.py distilbert-base-uncased path/train.txt path/dev.txt
```
Each line in train.txt / dev.txt is interpreted as one input for the transformer network, i.e. as one sentence or paragraph.
For more information on how to run MLM with Hugging Face transformers, see the [Language model training examples](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling).
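After MLM training, the checkpoint in the output folder can serve as the base model of a SentenceTransformer, which you then fine-tune with labeled data. A minimal sketch, assuming the checkpoint was saved to `output/mlm-model` (the path and the training pairs are placeholders):

```python
from sentence_transformers import SentenceTransformer, InputExample, models, losses
from torch.utils.data import DataLoader

# Load the MLM-tuned checkpoint as the word embedding model
word_embedding_model = models.Transformer("output/mlm-model", max_seq_length=75)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Fine-tune with whatever labeled pairs you have (dummy similarity pairs shown here)
train_examples = [
    InputExample(texts=["A man is eating food.", "A man eats something."], label=0.9),
    InputExample(texts=["A man is eating food.", "A plane is taking off."], label=0.1),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)
```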
"""
This script runs Masked Language Model (MLM) training. You provide a training file. Each line is interpreted as a sentence / paragraph.
Optionally, you can also provide a dev file.
The fine-tuned model is stored in the output/model_name folder.
Usage:
python train_mlm.py model_name data/train_sentences.txt [data/dev_sentences.txt]
"""
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import Trainer, TrainingArguments
import sys
import gzip
from datetime import datetime
if len(sys.argv) < 3:
print("Usage: python train_mlm.py model_name data/train_sentences.txt [data/dev_sentences.txt]")
exit()
model_name = sys.argv[1]
per_device_train_batch_size = 64
save_steps = 1000 # Save model every 1k steps
num_train_epochs = 3 # Number of epochs
use_fp16 = False # Set to True, if your GPU supports FP16 operations
max_length = 100 # Max length for a text input
do_whole_word_mask = True # If set to true, whole words are masked
mlm_prob = 0.15 # Probability that a word is replaced by a [MASK] token
# Load the model
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
output_dir = "output/{}-{}".format(model_name.replace("/", "_"), datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
print("Save checkpoints to:", output_dir)
##### Load our training datasets
train_sentences = []
train_path = sys.argv[2]
with gzip.open(train_path, "rt", encoding="utf8") if train_path.endswith(".gz") else open(
train_path, "r", encoding="utf8"
) as fIn:
for line in fIn:
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
print("Train sentences:", len(train_sentences))
dev_sentences = []
if len(sys.argv) >= 4:
dev_path = sys.argv[3]
with gzip.open(dev_path, "rt", encoding="utf8") if dev_path.endswith(".gz") else open(
dev_path, "r", encoding="utf8"
) as fIn:
for line in fIn:
line = line.strip()
if len(line) >= 10:
dev_sentences.append(line)
print("Dev sentences:", len(dev_sentences))
# A dataset wrapper that tokenizes our data on the fly
class TokenizedSentencesDataset:
def __init__(self, sentences, tokenizer, max_length, cache_tokenization=False):
self.tokenizer = tokenizer
self.sentences = sentences
self.max_length = max_length
self.cache_tokenization = cache_tokenization
def __getitem__(self, item):
if not self.cache_tokenization:
return self.tokenizer(
self.sentences[item],
add_special_tokens=True,
truncation=True,
max_length=self.max_length,
return_special_tokens_mask=True,
)
if isinstance(self.sentences[item], str):
self.sentences[item] = self.tokenizer(
self.sentences[item],
add_special_tokens=True,
truncation=True,
max_length=self.max_length,
return_special_tokens_mask=True,
)
return self.sentences[item]
def __len__(self):
return len(self.sentences)
train_dataset = TokenizedSentencesDataset(train_sentences, tokenizer, max_length)
dev_dataset = (
TokenizedSentencesDataset(dev_sentences, tokenizer, max_length, cache_tokenization=True)
if len(dev_sentences) > 0
else None
)
##### Training arguments
if do_whole_word_mask:
data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)
else:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)
training_args = TrainingArguments(
output_dir=output_dir,
overwrite_output_dir=True,
num_train_epochs=num_train_epochs,
evaluation_strategy="steps" if dev_dataset is not None else "no",
per_device_train_batch_size=per_device_train_batch_size,
eval_steps=save_steps,
save_steps=save_steps,
logging_steps=save_steps,
save_total_limit=1,
prediction_loss_only=True,
fp16=use_fp16,
)
trainer = Trainer(
model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=dev_dataset
)
print("Save tokenizer to:", output_dir)
tokenizer.save_pretrained(output_dir)
trainer.train()
print("Save model to:", output_dir)
model.save_pretrained(output_dir)
print("Training done")
# Unsupervised Learning
This page contains a collection of unsupervised learning methods to learn sentence embeddings. The methods have in common that they **do not require labeled training data**. Instead, they can learn semantically meaningful sentence embeddings just from the text itself.
**Note:** Unsupervised learning approaches are still an active research area, and in many cases the models perform rather poorly compared to models trained on labeled pairs as provided in our [training data collection](https://huggingface.co/datasets/sentence-transformers/embedding-training-data). A better approach is **[Domain Adaptation](../domain_adaptation/README.md)**, where you combine unsupervised learning on your target domain with existing labeled data. This gives the best performance on your specific corpus.
## TSDAE
In our work [TSDAE (Transformer-based Denoising AutoEncoder)](https://arxiv.org/abs/2104.06979) we present an unsupervised sentence embedding learning method based on denoising auto-encoders:
![](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/TSDAE.png)
We add noise to the input text; in our case, we delete about 60% of the words in the text. The encoder maps this input to a fixed-sized sentence embedding. A decoder then tries to re-create the original text without the noise. Later, we use the encoder as the sentence embedding method.
See **[TSDAE](TSDAE/README.md)** for more information and training examples.
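To see what the added noise looks like, you can wrap a few sentences in the `DenoisingAutoEncoderDataset`; a minimal sketch (the example sentence is a placeholder):

```python
from sentence_transformers import datasets

train_sentences = ["The quick brown fox jumps over the lazy dog."]

# The dataset adds deletion noise on the fly and pairs the noised sentence with the original
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
example = train_dataset[0]
print(example.texts)  # the noised sentence together with the original sentence
```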
## SimCSE
Gao et al. present in [SimCSE: Simple Contrastive Learning of Sentence Embeddings](https://arxiv.org/abs/2104.08821) a method that passes the same sentence twice to the sentence embedding encoder. Due to dropout, it will be encoded at slightly different positions in vector space.
The distance between these two embeddings will be minimized, while the distance to the embeddings of the other sentences in the same batch will be maximized.
![SimCSE working](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/SimCSE.png)
See **[SimCSE](SimCSE/README.md)** for more information and training examples.
## CT
Carlsson et al. present in [Semantic Re-Tuning With Contrastive Tension (CT)](https://openreview.net/pdf?id=Ov_sMNau-PF) an unsupervised method that uses two models: if the same sentence is passed to Model1 and Model2, the respective sentence embeddings should get a large dot-score. If different sentences are passed, the sentence embeddings should get a low score.
![CT working](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/CT.jpg)
See **[CT](CT/README.md)** for more information and training examples.
## CT (In-Batch Negative Sampling)
The CT method from Carlsson et al. provides sentence pairs to the two models. This can be improved by using in-batch negative sampling: Model1 and Model2 both encode the same set of sentences. We maximize the scores for matching indexes (i.e. Model1(S_i) and Model2(S_i)) while we minimize the scores for different indexes (i.e. Model1(S_i) and Model2(S_j) for i != j).
See **[CT_In-Batch_Negatives](CT_In-Batch_Negatives/README.md)** for more information and training examples.
## Masked Language Model (MLM)
BERT showed that Masked Language Modeling (MLM) is a powerful pre-training approach. It is advisable to first run MLM on a large dataset from your domain before you do fine-tuning. See **[MLM](MLM/README.md)** for more information and training examples.
## GenQ
In our paper [BEIR: A Heterogenous Benchmark for Zero-shot Evaluation of Information Retrieval Models](https://arxiv.org/abs/2104.08663) we present a method to learn a semantic search method by generating queries for given passages. This method has been improved in [GPL: Generative Pseudo Labeling for Unsupervised Domain Adaptation of Dense Retrieval](https://arxiv.org/abs/2112.07577).
We pass all passages in our collection through a trained T5 model, which generates potential queries from users. We then use these (query, passage) pairs to train a SentenceTransformer model.
![Query Generation](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/query-generation.png)
See **[GenQ](query_generation/README.md)** for more information and training examples. See **[GPL](../domain_adaptation/README.md)** for the improved version that uses a multi-step training approach.
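A minimal sketch of the query-generation step (the T5 checkpoint name and the passage are examples, not fixed choices):

```python
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "BeIR/query-gen-msmarco-t5-base-v1"  # example query-generation checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

passage = "Python is an interpreted, high-level, general-purpose programming language."
inputs = tokenizer(passage, return_tensors="pt")

# Sample several synthetic queries for the passage
outputs = model.generate(**inputs, max_length=64, do_sample=True, top_p=0.95, num_return_sequences=3)
for output in outputs:
    print(tokenizer.decode(output, skip_special_tokens=True))

# The resulting (query, passage) pairs can then be used to train a SentenceTransformer,
# e.g. with MultipleNegativesRankingLoss.
```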
## GPL
In [GPL: Generative Pseudo Labeling for Unsupervised Domain Adaptation of Dense Retrieval](https://arxiv.org/abs/2112.07577) we show an improved version of GenQ, which combines the generation with negative mining and pseudo labeling using a Cross-Encoder. It leads to significantly improved results. See **[Domain Adaptation](../domain_adaptation/README.md)** for more information.
![GPL Architecture](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/gpl_architecture.png)
## Performance Comparison
In our paper [TSDAE](https://arxiv.org/abs/2104.06979) we compare approaches for sentence embedding tasks, and in [GPL](https://arxiv.org/abs/2112.07577) we compare them for semantic search tasks (given a query, find relevant passages). While the unsupervised approaches achieve acceptable performance for sentence embedding tasks, they perform poorly for semantic search tasks.
# SimCSE
Gao et al. present in [SimCSE](https://arxiv.org/abs/2104.08821) a simple method to train sentence embeddings without having training data.
The idea is to encode the same sentence twice. Due to the dropout used in transformer models, both sentence embeddings will end up at slightly different positions. The distance between these two embeddings will be minimized, while the distance to the embeddings of the other sentences in the same batch will be maximized (they serve as negative examples).
![SimCSE working](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/SimCSE.png)
## Usage with SentenceTransformers
SentenceTransformers implements the [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss), which makes training with SimCSE trivial:
```python
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers import models, losses
from torch.utils.data import DataLoader
# Define your sentence transformer model using mean pooling
model_name = "distilroberta-base"
word_embedding_model = models.Transformer(model_name, max_seq_length=32)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Define a list with sentences (1k - 100k sentences)
train_sentences = [
"Your set of sentences",
"Model will automatically add the noise",
"And re-construct it",
"You should provide at least 1k sentences",
]
# Convert train sentences to sentence pairs
train_data = [InputExample(texts=[s, s]) for s in train_sentences]
# DataLoader to batch your data
train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)
# Use MultipleNegativesRankingLoss (in-batch negatives; each sentence is paired with itself)
train_loss = losses.MultipleNegativesRankingLoss(model)
# Call the fit method
model.fit(
train_objectives=[(train_dataloader, train_loss)], epochs=1, show_progress_bar=True
)
model.save("output/simcse-model")
```
## SimCSE from Sentences File
**[train_simcse_from_file.py](train_simcse_from_file.py)** loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
SimCSE will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
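Run it with the path to your sentences file:

```bash
python train_simcse_from_file.py path/to/sentences.txt
```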
## Training Examples
- **[train_askubuntu_simcse.py](train_askubuntu_simcse.py)** - Shows an example of how to train with SimCSE on the [AskUbuntu Questions dataset](https://github.com/taolei87/askubuntu).
- **[train_stsb_simcse.py](train_stsb_simcse.py)** - This script uses 1 million sentences from Wikipedia for training and evaluates SimCSE on the [STSbenchmark dataset](https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark).
## Ablation Study
We use the evaluation setup proposed in our [TSDAE paper](https://arxiv.org/abs/2104.06979).
Using mean pooling, with max_seq_length=32 and batch_size=128
| Base Model | AskUbuntu Test-Performance (MAP) |
| ---- | :----: |
| distilbert-base-uncased | 53.59 |
| bert-base-uncased | 54.89 |
| **distilroberta-base** | **56.16** |
| roberta-base | 55.89 |
Using mean pooling, with max_seq_length=32 and distilroberta-base model.
| Batch Size | AskUbuntu Test-Performance (MAP) |
| ---- | :----: |
| 128 | 56.16 |
| 256 | 56.63 |
| **512** | **56.69** |
Using max_seq_length=32, distilroberta-base model, and 512 batch size.
| Pooling Mode | AskUbuntu Test-Performance (MAP) |
| ---- | :----: |
| **Mean pooling** | **56.69** |
| CLS pooling | 56.56 |
| Max pooling | 52.91 |
**Note:**
This is a re-implementation of SimCSE within sentence-transformers. For the official SimCSE code, see: [princeton-nlp/SimCSE](https://github.com/princeton-nlp/SimCSE)
from sentence_transformers import SentenceTransformer, LoggingHandler, InputExample
from sentence_transformers import models, util, evaluation, losses
import logging
import os
import gzip
from torch.utils.data import DataLoader
from datetime import datetime
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Some training parameters. For this example, we use a batch_size of 128, a max sentence length (max_seq_length)
# of 32 word pieces, and roberta-base as the model
model_name = "roberta-base"
batch_size = 128
max_seq_length = 32
num_epochs = 1
################# Download AskUbuntu and extract training corpus #################
askubuntu_folder = "data/askubuntu"
output_path = "output/askubuntu-simcse-{}-{}-{}".format(
model_name, batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
## Download the AskUbuntu dataset from https://github.com/taolei87/askubuntu
for filename in ["text_tokenized.txt.gz", "dev.txt", "test.txt", "train_random.txt"]:
filepath = os.path.join(askubuntu_folder, filename)
if not os.path.exists(filepath):
util.http_get("https://github.com/taolei87/askubuntu/raw/master/" + filename, filepath)
# Read the corpus
corpus = {}
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, "text_tokenized.txt.gz"), "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
id = splits[0]
title = splits[1]
corpus[id] = title
# Read dev & test dataset
def read_eval_dataset(filepath):
dataset = []
with open(filepath) as fIn:
for line in fIn:
query_id, relevant_id, candidate_ids, bm25_scores = line.strip().split("\t")
if len(relevant_id) == 0: # Skip examples without relevant entries
continue
relevant_id = relevant_id.split(" ")
candidate_ids = candidate_ids.split(" ")
negative_ids = set(candidate_ids) - set(relevant_id)
dataset.append(
{
"query": corpus[query_id],
"positive": [corpus[pid] for pid in relevant_id],
"negative": [corpus[pid] for pid in negative_ids],
}
)
dev_test_ids.add(query_id)
dev_test_ids.update(candidate_ids)
return dataset
dev_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "dev.txt"))
test_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "test.txt"))
## Now we need a list of train sentences.
## In this example we simply use all sentences that don't appear in the dev/test set
train_sentences = []
for id, sentence in corpus.items():
if id not in dev_test_ids:
train_sentences.append(InputExample(texts=[sentence, sentence]))
logging.info("{} train sentences".format(len(train_sentences)))
################# Initialize an SBERT model #################
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
# Apply mean pooling
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Train the model #################
# As Loss function, we use MultipleNegativesRankingLoss
train_dataloader = DataLoader(train_sentences, batch_size=batch_size, shuffle=True, drop_last=True)
train_loss = losses.MultipleNegativesRankingLoss(model)
# Create a dev evaluator
dev_evaluator = evaluation.RerankingEvaluator(dev_dataset, name="AskUbuntu dev")
test_evaluator = evaluation.RerankingEvaluator(test_dataset, name="AskUbuntu test")
logging.info("Dev performance before training")
dev_evaluator(model)
warmup_steps = int(num_epochs * len(train_dataloader) * 0.1)
logging.info("Start training")
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
evaluation_steps=100,
epochs=num_epochs,
warmup_steps=warmup_steps,
output_path=output_path,
show_progress_bar=True,
use_amp=True, # If your GPU does not have FP16 cores, set use_amp=False
)
latest_output_path = output_path + "-latest"
model.save(latest_output_path)
### Run test evaluation on the latest model. This is equivalent to not having a dev dataset
model = SentenceTransformer(latest_output_path)
test_evaluator(model)
"""
This file loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
SimCSE will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
Usage:
python train_simcse_from_file.py path/to/sentences.txt
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, InputExample
import logging
from datetime import datetime
import gzip
import sys
import tqdm
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Training parameters
model_name = "distilroberta-base"
train_batch_size = 128
max_seq_length = 32
num_epochs = 1
# Input file path (a text file, each line a sentence)
if len(sys.argv) < 2:
print("Run this script with: python {} path/to/sentences.txt".format(sys.argv[0]))
exit()
filepath = sys.argv[1]
# Save path to store our model
output_name = ""
if len(sys.argv) >= 3:
output_name = "-" + sys.argv[2].replace(" ", "_").replace("/", "_").replace("\\", "_")
model_output_path = "output/train_simcse{}-{}".format(output_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Read the train corpus #################
train_samples = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
train_samples.append(InputExample(texts=[line, line]))
logging.info("Train sentences: {}".format(len(train_samples)))
# We train our model using the MultipleNegativesRankingLoss
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)
train_loss = losses.MultipleNegativesRankingLoss(model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=warmup_steps,
optimizer_params={"lr": 5e-5},
checkpoint_path=model_output_path,
show_progress_bar=True,
use_amp=False, # Set to True, if your GPU supports FP16 cores
)
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Training parameters
model_name = "distilbert-base-uncased"
train_batch_size = 128
num_epochs = 1
max_seq_length = 32
# Save path to store our model
model_save_path = "output/training_stsb_simcse-{}-{}-{}".format(
model_name, train_batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "data/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# We use 1 Million sentences from Wikipedia to train our model
wikipedia_dataset_path = "data/wiki1m_for_simcse.txt"
if not os.path.exists(wikipedia_dataset_path):
util.http_get(
"https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt",
wikipedia_dataset_path,
)
# train_samples is a list of InputExample objects where we pass the same sentence twice to texts, i.e. texts=[sent, sent]
train_samples = []
with open(wikipedia_dataset_path, "r", encoding="utf8") as fIn:
for line in fIn:
line = line.strip()
if len(line) >= 10:
train_samples.append(InputExample(texts=[line, line]))
# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
if row["split"] == "dev":
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
elif row["split"] == "test":
test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
dev_samples, batch_size=train_batch_size, name="sts-dev"
)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
test_samples, batch_size=train_batch_size, name="sts-test"
)
# We train our model using the MultipleNegativesRankingLoss
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)
train_loss = losses.MultipleNegativesRankingLoss(model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
evaluation_steps = int(len(train_dataloader) * 0.1) # Evaluate every 10% of the data
logging.info("Training sentences: {}".format(len(train_samples)))
logging.info("Warmup-steps: {}".format(warmup_steps))
logging.info("Performance before training")
dev_evaluator(model)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=evaluation_steps,
warmup_steps=warmup_steps,
output_path=model_save_path,
optimizer_params={"lr": 5e-5},
use_amp=True, # Set to True, if your GPU supports FP16 cores
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator(model, output_path=model_save_path)
# TSDAE
This section shows an example of how to train an unsupervised [TSDAE (Transformer-based Denoising AutoEncoder)](https://arxiv.org/abs/2104.06979) model using only plain sentences as training data.
## Background
During training, TSDAE encodes damaged sentences into fixed-sized vectors and requires the decoder to reconstruct the original sentences from these sentence embeddings. For good reconstruction quality, the semantics must be captured well in the sentence embeddings from the encoder. Later, at inference, we only use the encoder for creating sentence embeddings. The architecture is illustrated in the figure below:
![](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/TSDAE.png)
## Unsupervised Training with TSDAE
Training with TSDAE is simple. You just need a set of sentences:
```python
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader
# Define your sentence transformer model using CLS pooling
model_name = "bert-base-uncased"
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Define a list with sentences (1k - 100k sentences)
train_sentences = [
"Your set of sentences",
"Model will automatically add the noise",
"And re-construct it",
"You should provide at least 1k sentences",
]
# Create the special denoising dataset that adds noise on-the-fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
# DataLoader to batch your data
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# Use the denoising auto-encoder loss
train_loss = losses.DenoisingAutoEncoderLoss(
model, decoder_name_or_path=model_name, tie_encoder_decoder=True
)
# Call the fit method
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=1,
weight_decay=0,
scheduler="constantlr",
optimizer_params={"lr": 3e-5},
show_progress_bar=True,
)
model.save("output/tsdae-model")
```
## TSDAE from Sentences File
**[train_tsdae_from_file.py](train_tsdae_from_file.py)** loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
TSDAE will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
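For example:

```bash
python train_tsdae_from_file.py path/to/sentences.txt
```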
## TSDAE on AskUbuntu Dataset
The [AskUbuntu dataset](https://github.com/taolei87/askubuntu) is a manually annotated dataset for the [AskUbuntu forum](https://askubuntu.com/). For 400 questions, experts annotated 20 other questions per question as related or not related. The questions are split into a train and a development set.
**[train_askubuntu_tsdae.py](train_askubuntu_tsdae.py)** - Shows an example of how to train a model on AskUbuntu using only sentences without any labels. As sentences, we use the titles that are not used in the dev / test set.
| Model | MAP-Score on test set |
| ---- | :----: |
| TSDAE (bert-base-uncased) | 59.4 |
| **pretrained SentenceTransformer models** | |
| nli-bert-base | 50.7 |
| paraphrase-distilroberta-base-v1 | 54.8 |
| stsb-roberta-large | 54.6 |
----------------------
## TSDAE as Pre-Training Task
As we show in our [TSDAE paper](https://arxiv.org/abs/2104.06979), TSDAE is also a powerful pre-training method, outperforming the classical Masked Language Model (MLM) pre-training task.
You first train your model with the TSDAE loss. After you have trained for a certain number of steps / after the model has converged, you can further fine-tune your pre-trained model like any other SentenceTransformer model.
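A minimal sketch of this second, supervised stage, assuming the TSDAE model was stored in `output/tsdae-model` (the path and the training pairs are placeholders):

```python
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Load the TSDAE pre-trained encoder
model = SentenceTransformer("output/tsdae-model")

# Continue with any supervised objective, e.g. positive pairs and MultipleNegativesRankingLoss
train_examples = [
    InputExample(texts=["How do I upgrade Ubuntu?", "Upgrading to the latest Ubuntu release"]),
    InputExample(texts=["Wifi does not work after suspend", "Wireless connection lost when resuming"]),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)
```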
## Citation
If you use the code for TSDAE, feel free to cite our publication [TSDAE: Using Transformer-based Sequential Denoising Auto-Encoder for Unsupervised Sentence Embedding Learning](https://arxiv.org/abs/2104.06979):
```bibtex
@article{wang-2021-TSDAE,
title = "TSDAE: Using Transformer-based Sequential Denoising Auto-Encoderfor Unsupervised Sentence Embedding Learning",
author = "Wang, Kexin and Reimers, Nils and Gurevych, Iryna",
journal= "arXiv preprint arXiv:2104.06979",
month = "4",
year = "2021",
url = "https://arxiv.org/abs/2104.06979",
}
```
"""
This script runs the evaluation (dev & test) for the AskUbuntu dataset
Usage:
python eval_askubuntu.py [sbert_model_name_or_path]
"""
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import util, evaluation
import logging
import os
import gzip
import sys
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
model = SentenceTransformer(sys.argv[1])
################# Download AskUbuntu and extract training corpus #################
askubuntu_folder = "askubuntu"
training_corpus = os.path.join(askubuntu_folder, "train.unsupervised.txt")
## Download the AskUbuntu dataset from https://github.com/taolei87/askubuntu
for filename in ["text_tokenized.txt.gz", "dev.txt", "test.txt", "train_random.txt"]:
filepath = os.path.join(askubuntu_folder, filename)
if not os.path.exists(filepath):
util.http_get("https://github.com/taolei87/askubuntu/raw/master/" + filename, filepath)
# Read the corpus
corpus = {}
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, "text_tokenized.txt.gz"), "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
id = splits[0]
title = splits[1]
corpus[id] = title
# Read dev & test dataset
def read_eval_dataset(filepath):
dataset = []
with open(filepath) as fIn:
for line in fIn:
query_id, relevant_id, candidate_ids, bm25_scores = line.strip().split("\t")
if len(relevant_id) == 0: # Skip examples without relevant entries
continue
relevant_id = relevant_id.split(" ")
candidate_ids = candidate_ids.split(" ")
negative_ids = set(candidate_ids) - set(relevant_id)
dataset.append(
{
"query": corpus[query_id],
"positive": [corpus[pid] for pid in relevant_id],
"negative": [corpus[pid] for pid in negative_ids],
}
)
dev_test_ids.add(query_id)
dev_test_ids.update(candidate_ids)
return dataset
dev_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "dev.txt"))
test_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "test.txt"))
# Create a dev evaluator
dev_evaluator = evaluation.RerankingEvaluator(dev_dataset, name="AskUbuntu dev")
logging.info("Dev performance before training")
dev_evaluator(model)
test_evaluator = evaluation.RerankingEvaluator(test_dataset, name="AskUbuntu test")
logging.info("Test performance before training")
test_evaluator(model)
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
import logging
import os
import gzip
from torch.utils.data import DataLoader
from datetime import datetime
import sys
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
################# Download AskUbuntu and extract training corpus #################
askubuntu_folder = "data/askubuntu"
result_folder = "output/askubuntu-tsdae-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
batch_size = 8
## Download the AskUbuntu dataset from https://github.com/taolei87/askubuntu
for filename in ["text_tokenized.txt.gz", "dev.txt", "test.txt", "train_random.txt"]:
filepath = os.path.join(askubuntu_folder, filename)
if not os.path.exists(filepath):
util.http_get("https://github.com/taolei87/askubuntu/raw/master/" + filename, filepath)
# Read the corpus
corpus = {}
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, "text_tokenized.txt.gz"), "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
id = splits[0]
title = splits[1]
corpus[id] = title
# Read dev & test dataset
def read_eval_dataset(filepath):
dataset = []
with open(filepath) as fIn:
for line in fIn:
query_id, relevant_id, candidate_ids, bm25_scores = line.strip().split("\t")
if len(relevant_id) == 0: # Skip examples without relevant entries
continue
relevant_id = relevant_id.split(" ")
candidate_ids = candidate_ids.split(" ")
negative_ids = set(candidate_ids) - set(relevant_id)
dataset.append(
{
"query": corpus[query_id],
"positive": [corpus[pid] for pid in relevant_id],
"negative": [corpus[pid] for pid in negative_ids],
}
)
dev_test_ids.add(query_id)
dev_test_ids.update(candidate_ids)
return dataset
dev_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "dev.txt"))
test_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "test.txt"))
## Now we need a list of train sentences.
## In this example we simply use all sentences that don't appear in the dev/test set
train_sentences = []
for id, sentence in corpus.items():
if id not in dev_test_ids:
train_sentences.append(sentence)
logging.info("{} train sentences".format(len(train_sentences)))
################# Initialize an SBERT model #################
model_name = sys.argv[1] if len(sys.argv) >= 2 else "bert-base-uncased"
word_embedding_model = models.Transformer(model_name)
# Apply **cls** pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Train and evaluate the model (it needs about 1 hour for one epoch of AskUbuntu) #################
# We wrap our training sentences in the DenoisingAutoEncoderDataset to add deletion noise on the fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)
# Create a dev evaluator
dev_evaluator = evaluation.RerankingEvaluator(dev_dataset, name="AskUbuntu dev")
logging.info("Dev performance before training")
dev_evaluator(model)
total_steps = 20000
logging.info("Start training")
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
evaluation_steps=1000,
epochs=1,
steps_per_epoch=total_steps,
weight_decay=0,
scheduler="constantlr",
optimizer_params={"lr": 3e-5},
output_path=result_folder,
show_progress_bar=True,
)
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Training parameters
model_name = "bert-base-uncased"
train_batch_size = 8
num_epochs = 1
max_seq_length = 75
# Save path to store our model
model_save_path = "output/training_stsb_tsdae-{}-{}-{}".format(
model_name, train_batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "data/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# Defining our sentence transformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# We use 1 Million sentences from Wikipedia to train our model
wikipedia_dataset_path = "data/wiki1m_for_simcse.txt"
if not os.path.exists(wikipedia_dataset_path):
util.http_get(
"https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt",
wikipedia_dataset_path,
)
# train_sentences is a list of plain sentences; the DenoisingAutoEncoderDataset below adds deletion noise to them on the fly
train_sentences = []
with open(wikipedia_dataset_path, "r", encoding="utf8") as fIn:
for line in fIn:
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
if row["split"] == "dev":
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
elif row["split"] == "test":
test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
dev_samples, batch_size=train_batch_size, name="sts-dev"
)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
test_samples, batch_size=train_batch_size, name="sts-test"
)
# We train our model using the DenoisingAutoEncoderLoss (TSDAE)
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, drop_last=True)
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)
evaluation_steps = 1000
logging.info("Training sentences: {}".format(len(train_sentences)))
logging.info("Performance before training")
dev_evaluator(model)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=evaluation_steps,
output_path=model_save_path,
weight_decay=0,
warmup_steps=100,
optimizer_params={"lr": 3e-5},
    use_amp=True,  # Automatic mixed precision; requires a GPU with FP16 support
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator(model, output_path=model_save_path)
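# A small usage sketch (not part of the original script): the fine-tuned TSDAE model behaves
# like any other SentenceTransformer, so we can encode sentences and compare them via cosine
# similarity. The two sentences below are made-up examples.
example_sentences = ["A man is playing a guitar.", "Someone is playing an instrument."]
example_embeddings = model.encode(example_sentences, convert_to_tensor=True)
example_score = util.cos_sim(example_embeddings[0], example_embeddings[1]).item()
logging.info("Cosine similarity of the example pair: {:.4f}".format(example_score))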
"""
This file loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
TSDAE will be trained using these sentences. Checkpoints are stored every 500 steps in the output folder.
Usage:
python train_tsdae_from_file.py path/to/sentences.txt
"""
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, datasets, losses
import logging
import gzip
from torch.utils.data import DataLoader
from datetime import datetime
import sys
import tqdm
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Train Parameters
model_name = "bert-base-uncased"
batch_size = 8
# Input file path (a text file, each line a sentence)
if len(sys.argv) < 2:
print("Run this script with: python {} path/to/sentences.txt".format(sys.argv[0]))
exit()
filepath = sys.argv[1]
# Save path to store our model
output_name = ""
if len(sys.argv) >= 3:
output_name = "-" + sys.argv[2].replace(" ", "_").replace("/", "_").replace("\\", "_")
model_output_path = "output/train_tsdae{}-{}".format(output_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
################# Read the train corpus #################
train_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
logging.info("{} train sentences".format(len(train_sentences)))
################# Initialize an SBERT model #################
word_embedding_model = models.Transformer(model_name)
# Apply CLS pooling to get one fixed-sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Train the model #################
# We wrap our training sentences in the DenoisingAutoEncoderDataset to add deletion noise on the fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)
logging.info("Start training")
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=1,
weight_decay=0,
scheduler="constantlr",
optimizer_params={"lr": 3e-5},
show_progress_bar=True,
checkpoint_path=model_output_path,
use_amp=False, # Set to True, if your GPU supports FP16 cores
)
"""
In this example we train a semantic search model to search through Wikipedia
articles about programming languages & technologies.
We use the text paragraphs from the following Wikipedia articles:
Assembly language, C, C Sharp, C++, Go, Java, JavaScript, Keras, Laravel, MATLAB, Matplotlib, MongoDB, MySQL, Natural Language Toolkit, NumPy, pandas (software), Perl, PHP, PostgreSQL, Python, PyTorch, R, React, Rust, Scala, scikit-learn, SciPy, Swift, TensorFlow, Vue.js
The example consists of three scripts:
1_programming_query_generation.py - We generate queries for all paragraphs from these articles
2_programming_train_bi-encoder.py - We train a SentenceTransformer bi-encoder with these generated queries. This results in a model we can then use for semantic search (for the given Wikipedia articles).
3_programming_semantic_search.py - Shows how the trained model can be used for semantic search
"""
import json
import gzip
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import tqdm
import os
from sentence_transformers import util
paragraphs = set()
# We use the Wikipedia articles of certain programming languages
corpus_filepath = "wiki-programmming-20210101.jsonl.gz"
if not os.path.exists(corpus_filepath):
util.http_get("https://sbert.net/datasets/wiki-programmming-20210101.jsonl.gz", corpus_filepath)
with gzip.open(corpus_filepath, "rt") as fIn:
for line in fIn:
data = json.loads(line.strip())
for p in data["paragraphs"]:
if len(p) > 100: # Only take paragraphs with at least 100 chars
paragraphs.add(p)
paragraphs = list(paragraphs)
print("Paragraphs:", len(paragraphs))
# Now we load the model that is able to generate queries given a paragraph.
# This model was trained on the MS MARCO dataset, which contains about 500k real
# search queries from Bing together with their relevant passages
tokenizer = T5Tokenizer.from_pretrained("BeIR/query-gen-msmarco-t5-large-v1")
model = T5ForConditionalGeneration.from_pretrained("BeIR/query-gen-msmarco-t5-large-v1")
model.eval()
# Select the device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Parameters for generation
batch_size = 8 # Batch size
num_queries = 5 # Number of queries to generate for every paragraph
max_length_paragraph = 300 # Max length for paragraph
max_length_query = 64 # Max length for output query
# Now for every paragraph in our corpus, we generate the queries
with open("generated_queries.tsv", "w") as fOut:
for start_idx in tqdm.trange(0, len(paragraphs), batch_size):
sub_paragraphs = paragraphs[start_idx : start_idx + batch_size]
        inputs = tokenizer(
            sub_paragraphs, max_length=max_length_paragraph, truncation=True, padding=True, return_tensors="pt"
        ).to(device)
outputs = model.generate(
**inputs, max_length=max_length_query, do_sample=True, top_p=0.95, num_return_sequences=num_queries
)
for idx, out in enumerate(outputs):
query = tokenizer.decode(out, skip_special_tokens=True)
            para = sub_paragraphs[idx // num_queries]
fOut.write("{}\t{}\n".format(query.replace("\t", " ").strip(), para.replace("\t", " ").strip()))
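# The resulting generated_queries.tsv contains one tab-separated (query, paragraph) pair per line,
# with num_queries generated queries per paragraph; tabs inside the texts are replaced by spaces
# above so that the file stays well-formed.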
"""
In this example we train a semantic search model to search through Wikipedia
articles about programming languages & technologies.
We use the text paragraphs from the following Wikipedia articles:
Assembly language, C, C Sharp, C++, Go, Java, JavaScript, Keras, Laravel, MATLAB, Matplotlib, MongoDB, MySQL, Natural Language Toolkit, NumPy, pandas (software), Perl, PHP, PostgreSQL, Python, PyTorch, R, React, Rust, Scala, scikit-learn, SciPy, Swift, TensorFlow, Vue.js
The example consists of three scripts:
1_programming_query_generation.py - We generate queries for all paragraphs from these articles
2_programming_train_bi-encoder.py - We train a SentenceTransformer bi-encoder with these generated queries. This results in a model we can then use for semantic search (for the given Wikipedia articles).
3_programming_semantic_search.py - Shows how the trained model can be used for semantic search
"""
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets
import os
train_examples = []
with open("generated_queries.tsv") as fIn:
for line in fIn:
query, paragraph = line.strip().split("\t", maxsplit=1)
train_examples.append(InputExample(texts=[query, paragraph]))
# For the MultipleNegativesRankingLoss, it is important
# that the batch does not contain duplicate entries, i.e.
# no two equal queries and no two equal paragraphs.
# To ensure this, we use a special data loader
train_dataloader = datasets.NoDuplicatesDataLoader(train_examples, batch_size=64)
# Now we create a SentenceTransformer model from scratch
word_emb = models.Transformer("distilbert-base-uncased")
pooling = models.Pooling(word_emb.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_emb, pooling])
# MultipleNegativesRankingLoss requires input pairs (query, relevant_passage)
# and trains the model so that it is suitable for semantic search
train_loss = losses.MultipleNegativesRankingLoss(model)
# Tune the model
num_epochs = 3
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=warmup_steps,
show_progress_bar=True,
)
os.makedirs("output", exist_ok=True)
model.save("output/programming-model")
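# A minimal semantic-search sketch (the role 3_programming_semantic_search.py plays in this
# example), assuming the model saved above; the query string and the two example paragraphs
# below are made up for illustration.
from sentence_transformers import util

search_model = SentenceTransformer("output/programming-model")
example_corpus = [
    "Python is an interpreted, high-level, general-purpose programming language.",
    "Rust is a systems programming language focused on performance and memory safety.",
]
corpus_embeddings = search_model.encode(example_corpus, convert_to_tensor=True)
query_embedding = search_model.encode("which language focuses on memory safety?", convert_to_tensor=True)
for hit in util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]:
    print("{:.3f}\t{}".format(hit["score"], example_corpus[hit["corpus_id"]]))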