Masked Language Model (MLM) is the pre-training task that BERT was trained with. It has been shown that continuing MLM on your own data can improve performance (see [Don't Stop Pretraining: Adapt Language Models to Domains and Tasks](https://arxiv.org/abs/2004.10964)). In our [TSDAE paper](https://arxiv.org/abs/2104.06979) we also show that MLM is a powerful pre-training strategy for learning sentence embeddings. This is especially the case when you work on a specialized domain.
**Note:** Only running MLM will not yield good sentence embeddings. But you can first tune your favorite transformer model with MLM on your domain-specific data, and then fine-tune the model with the labeled data you have or with other datasets like [NLI](../../training/nli/README.md), [Paraphrases](../../training/paraphrases/README.md), or [STS](../../training/sts/README.md).
Each line in train.txt / dev.txt is interpreted as one input for the transformer network, i.e. as one sentence or paragraph.
For more information on how to run MLM with Hugging Face transformers, see the [Language model training examples](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling).
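As a rough sketch (not the official example script; the model choice, output path, and hyperparameters are placeholders), continued MLM pre-training with the Hugging Face `Trainer` could look like this:

```python
from transformers import (AutoModelForMaskedLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, LineByLineTextDataset,
                          Trainer, TrainingArguments)

model_name = "bert-base-uncased"  # assumption: any MLM-capable checkpoint works here
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# One sentence / paragraph per line, as described above
train_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="train.txt", block_size=256)

# Randomly mask 15% of the tokens, as in the original BERT pre-training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

training_args = TrainingArguments(output_dir="output/mlm", num_train_epochs=1, per_device_train_batch_size=32)
trainer = Trainer(model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset)
trainer.train()
```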
This page contains a collection of unsupervised learning methods to learn sentence embeddings. The methods have in common that they **do not require labeled training data**. Instead, they can learn semantically meaningful sentence embeddings just from the text itself.
**Note:** Unsupervised learning approaches are still an active research area, and in many cases the models perform rather poorly compared to models trained on labeled pairs as provided in our [training data collection](https://huggingface.co/datasets/sentence-transformers/embedding-training-data). A better approach is **[Domain Adaptation](../domain_adaptation/README.md)**, where you combine unsupervised learning on your target domain with existing labeled data. This gives the best performance on your specific corpus.
## TSDAE
In our work [TSDAE (Transformer-based Denoising AutoEncoder)](https://arxiv.org/abs/2104.06979) we present an unsupervised sentence embedding learning method based on denoising auto-encoders:
We add noise to the input text; in our case, we delete about 60% of the words in the text. The encoder maps this input to a fixed-sized sentence embedding. A decoder then tries to re-create the original text without the noise. Later, we use the encoder as the sentence embedding method.
See **[TSDAE](TSDAE/README.md)** for more information and training examples.
## SimCSE
Gao et al. present in [SimCSE: Simple Contrastive Learning of Sentence Embeddings](https://arxiv.org/abs/2104.08821) a method that passes the same sentence twice to the sentence embedding encoder. Due to dropout, the sentence will be encoded at slightly different positions in vector space.
The distance between these two embeddings will be minimized, while the distance to other embeddings of the other sentences in the same batch will be maximized.
See **[SimCSE](SimCSE/README.md)** for more information and training examples.
## CT
Carlsson et al. present in [Semantic Re-Tuning With Contrastive Tension (CT)](https://openreview.net/pdf?id=Ov_sMNau-PF) an unsupervised method that uses two models: if the same sentence is passed to Model1 and Model2, then the respective sentence embeddings should get a large dot-product score, while for different sentences the embeddings should get a low score.
See **[CT](CT/README.md)** for more information and training examples.
## CT (In-Batch Negative Sampling)
The CT method from Carlsson et al. provides sentence pairs to the two models. This can be improved by using in-batch negative sampling: Model1 and Model2 both encode the same set of sentences. We maximize the scores for matching indexes (i.e. Model1(S_i) and Model2(S_i)) while we minimize the scores for different indexes (i.e. Model1(S_i) and Model2(S_j) for i != j).
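The objective is easy to express directly. Below is a minimal PyTorch sketch of the scoring (an illustration, not the authors' code); sentence-transformers also ships a ready-made version of this loss as `losses.ContrastiveTensionLossInBatchNegatives`:

```python
import torch
import torch.nn.functional as F

def ct_in_batch_loss(emb1: torch.Tensor, emb2: torch.Tensor) -> torch.Tensor:
    # emb1: Model1 embeddings, emb2: Model2 embeddings of the same batch of sentences,
    # both of shape (batch_size, embedding_dim)
    scores = emb1 @ emb2.T  # (batch_size, batch_size) dot-product score matrix
    labels = torch.arange(scores.size(0), device=scores.device)  # row i should match column i
    # Cross-entropy maximizes Model1(S_i) . Model2(S_i) on the diagonal while
    # minimizing Model1(S_i) . Model2(S_j) for i != j
    return F.cross_entropy(scores, labels)
```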
See **[CT_In-Batch_Negatives](CT_In-Batch_Negatives/README.md)** for more information and training examples.
## Masked Language Model (MLM)
BERT showed that Masked Language Model (MLM) is a powerful pre-training approach. It is advisable to first run MLM on a large dataset from your domain before you do fine-tuning. See **[MLM](MLM/README.md)** for more information and training examples.
## GenQ
In our paper [BEIR: A Heterogenous Benchmark for Zero-shot Evaluation of Information Retrieval Models](https://arxiv.org/abs/2104.08663) we present a method to learn a semantic search method by generating queries for given passages. This method has been improved in [GPL: Generative Pseudo Labeling for Unsupervised Domain Adaptation of Dense Retrieval](https://arxiv.org/abs/2112.07577).
We pass all passages in our collection through a trained T5 model, which generates potential queries users might search for. We then use these (query, passage) pairs to train a SentenceTransformer model.
See **[GenQ](query_generation/README.md)** for more information and training examples. See **[GPL](../domain_adaptation/README.md)** for the improved version that uses a multi-step training approach.
## GPL
In [GPL: Generative Pseudo Labeling for Unsupervised Domain Adaptation of Dense Retrieval](https://arxiv.org/abs/2112.07577) we show an improved version of GenQ, which combines the generation with negative mining and pseudo labeling using a Cross-Encoder. It leads to significantly improved results. See **[Domain Adaptation](../domain_adaptation/README.md)** for more information.
In [TSDAE](https://arxiv.org/abs/2104.06979) we compare approaches on sentence embedding tasks, and in [GPL](https://arxiv.org/abs/2112.07577) we compare them on semantic search tasks (given a query, find relevant passages). While the unsupervised approaches achieve acceptable performance on sentence embedding tasks, they perform poorly on semantic search tasks.
Gao et al. present in [SimCSE](https://arxiv.org/abs/2104.08821) a simple method to train sentence embeddings without any labeled training data.
The idea is to encode the same sentence twice. Due to the dropout used in transformer models, both sentence embeddings will be at slightly different positions. The distance between these two embeddings will be minimized, while the distance to other embeddings of the other sentences in the same batch will be maximized (they serve as negative examples).
SentenceTransformers implements the [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss), which makes training with SimCSE trivial:
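A minimal sketch (the model name, sentences, and hyperparameters here are placeholders; see **train_simcse_from_file.py** below for the full script):

```python
from torch.utils.data import DataLoader

from sentence_transformers import InputExample, SentenceTransformer, losses

model = SentenceTransformer("distilroberta-base")

# Unlabeled training sentences; replace with your own data
sentences = ["The first training sentence", "The second training sentence"]

# Pass each sentence twice: dropout produces two slightly different embeddings
train_examples = [InputExample(texts=[s, s]) for s in sentences]
train_dataloader = DataLoader(train_examples, batch_size=128, shuffle=True)

# Identical pairs are pulled together; all other sentences in the batch act as negatives
train_loss = losses.MultipleNegativesRankingLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, show_progress_bar=True)
```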
**[train_simcse_from_file.py](train_simcse_from_file.py)** loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
SimCSE will be trained using these sentences. Checkpoints are stored every 500 steps to the output folder.
## Training Examples
- **[train_askubuntu_simcse.py](train_askubuntu_simcse.py)** - Shows an example of how to train with SimCSE on the [AskUbuntu Questions dataset](https://github.com/taolei87/askubuntu).
- **[train_stsb_simcse.py](train_stsb_simcse.py)** - This script uses 1 million sentences and evaluates SimCSE on the [STSbenchmark dataset](https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark).
## Ablation Study
We use the evaluation setup proposed in our [TSDAE paper](https://arxiv.org/abs/2104.06979).
Using mean pooling, with max_seq_length=32 and batch_size=128:
| Base Model | AskUbuntu Test-Performance (MAP) |
| ---- | :----: |
| distilbert-base-uncased | 53.59 |
| bert-base-uncased | 54.89 |
| **distilroberta-base** | **56.16** |
| roberta-base | 55.89 |
Using mean pooling, with max_seq_length=32 and the distilroberta-base model:
| Batch Size | AskUbuntu Test-Performance (MAP) |
| ---- | :----: |
| 128 | 56.16 |
| 256 | 56.63 |
| **512** | **56.69** |
Using max_seq_length=32, the distilroberta-base model, and a batch size of 512.
This is a re-implementation of SimCSE within sentence-transformers. For the official SimCSE code, see: [princeton-nlp/SimCSE](https://github.com/princeton-nlp/SimCSE)
This section shows an example of how we can train an unsupervised [TSDAE (Transformer-based Denoising AutoEncoder)](https://arxiv.org/abs/2104.06979) model with pure sentences as training data.
## Background
During training, TSDAE encodes damaged sentences into fixed-sized vectors and requires the decoder to reconstruct the original sentences from these sentence embeddings. For good reconstruction quality, the semantics must be captured well in the sentence embeddings from the encoder. Later, at inference, we only use the encoder for creating sentence embeddings.
**[train_tsdae_from_file.py](train_tsdae_from_file.py)** loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
TSDAE will be trained using these sentences. Checkpoints are stored every 500 steps to the output folder.
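In condensed form, the training setup looks roughly like this (a sketch of what the script does; `sentences` stands in for the lines from your text file):

```python
from torch.utils.data import DataLoader

from sentence_transformers import SentenceTransformer, datasets, losses, models

model_name = "bert-base-uncased"
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Unlabeled training sentences; replace with the lines from your file
sentences = ["The first training sentence", "The second training sentence"]

# Applies the deletion noise (removing ~60% of the words) on the fly
train_dataset = datasets.DenoisingAutoEncoderDataset(sentences)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# The decoder is tied to the encoder weights and must reconstruct the original sentence
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    scheduler="constantlr",
    optimizer_params={"lr": 3e-5},
    show_progress_bar=True,
)
```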
## TSDAE on AskUbuntu Dataset
The [AskUbuntu dataset](https://github.com/taolei87/askubuntu) is a manually annotated dataset for the [AskUbuntu forum](https://askubuntu.com/). For 400 questions, experts annotated 20 other questions each, marking whether they are related or not. The questions are split into a train & development set.
**[train_askubuntu_tsdae.py](train_askubuntu_tsdae.py)** - Shows an example of how to train a model on AskUbuntu using only sentences without any labels. As sentences, we use the titles that are not used in the dev / test set.
| Model | MAP-Score on test set |
| ---- | :----: |
| TSDAE (bert-base-uncased) | 59.4 |
| **pretrained SentenceTransformer models** | |
| nli-bert-base | 50.7 |
| paraphrase-distilroberta-base-v1 | 54.8 |
| stsb-roberta-large | 54.6 |
----------------------
## TSDAE as Pre-Training Task
As we show in our [TSDAE paper](https://arxiv.org/abs/2104.06979), TSDAE is also a powerful pre-training method, outperforming the classical Masked Language Model (MLM) pre-training task.
You first train your model with the TSDAE loss. After you have trained for a certain number of steps / after the model converges, you can further fine-tune your pre-trained model like any other SentenceTransformer model.
## Citation
If you use the code for TSDAE, feel free to cite our publication [TSDAE: Using Transformer-based Sequential Denoising Auto-Encoder for Unsupervised Sentence Embedding Learning](https://arxiv.org/abs/2104.06979):
```bibtex
@article{wang-2021-TSDAE,
    title = "TSDAE: Using Transformer-based Sequential Denoising Auto-Encoder for Unsupervised Sentence Embedding Learning",
    author = "Wang, Kexin and Reimers, Nils and Gurevych, Iryna",
    journal = "arXiv preprint arXiv:2104.06979",
    year = "2021",
    url = "https://arxiv.org/abs/2104.06979",
}
```
In our paper [BEIR: A Heterogenous Benchmark for Zero-shot Evaluation of Information Retrieval Models](https://arxiv.org/abs/2104.08663) we presented a method to adapt a model for [asymmetric semantic search](../../applications/semantic-search/) to a corpus without labeled training data.
## Background
In [asymmetric semantic search](../../applications/semantic-search/), the user provides a (short) query like some keywords or a question. We then want to retrieve a longer text passage that provides the answer.
For example:
```
query: What is Python?
passage to retrieve: Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects.
```
We showed how to train such models when sufficient training data (query & relevant passage pairs) is available in: [Training MS MARCO dataset](../../training/ms_marco)
In this tutorial, we show how to train such models if **no training data is available**, i.e., if you don't have thousands of labeled query & relevant passage pairs.
## Overview
We use **synthetic query generation** to achieve our goal: we start with the passages from our document collection and generate possible queries users might ask / search for. For example, given the following passage:
```
Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects.
```
We pass this passage through a specially trained [T5 model](https://arxiv.org/abs/1910.10683) which generates possible queries for us. For the above passage, it might generate these queries:
- What is python
- definition python
- what language uses whitespaces
We then use these generated queries to create our training set:
```
(What is python, Python is an interpreted...)
(definition python, Python is an interpreted...)
(what language uses whitespaces, Python is an interpreted...)
```
And train our SentenceTransformer bi-encoder with it.
## Query Generation
In [BeIR](https://huggingface.co/BeIR) we provide different models that can be used for query generation. In this example, we use the T5 model that was trained by [docTTTTTquery](https://github.com/castorini/docTTTTTquery):
```python
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("BeIR/query-gen-msmarco-t5-large-v1")
model = T5ForConditionalGeneration.from_pretrained("BeIR/query-gen-msmarco-t5-large-v1")
model.eval()

para = "Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects."

# Sample three possible queries for the passage with top-p (nucleus) sampling
input_ids = tokenizer.encode(para, return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(input_ids=input_ids, max_length=64, do_sample=True, top_p=0.95, num_return_sequences=3)

for i, output in enumerate(outputs):
    print(f"{i + 1}: {tokenizer.decode(output, skip_special_tokens=True)}")
```
In the above code, we use [Top-p (nucleus) sampling](https://huggingface.co/blog/how-to-generate) which will randomly pick a word from a collection of likely words. As a consequence, the model will generate different queries each time.
## Bi-Encoder Training
With the generated queries, we can then train a bi-encoder using the [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss).
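A minimal sketch (the model name, pairs, and hyperparameters are placeholders; the full pipeline is in the example scripts below):

```python
from torch.utils.data import DataLoader

from sentence_transformers import InputExample, SentenceTransformer, losses

# Generated (query, passage) pairs, e.g. from the T5 model above
train_examples = [
    InputExample(texts=["what is python", "Python is an interpreted, high-level and general-purpose programming language."]),
]
train_dataloader = DataLoader(train_examples, batch_size=64, shuffle=True)

model = SentenceTransformer("distilbert-base-uncased")
# For each query, all other passages in the batch serve as in-batch negatives
train_loss = losses.MultipleNegativesRankingLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=3, show_progress_bar=True)
```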
## Full Example
We train a semantic search model to search through Wikipedia articles about programming languages & technologies.
We use the text paragraphs from the following Wikipedia articles:
Assembly language, C, C#, C++, Go, Java, JavaScript, Keras, Laravel, MATLAB, Matplotlib, MongoDB, MySQL, Natural Language Toolkit, NumPy, pandas (software), Perl, PHP, PostgreSQL, Python, PyTorch, R, React, Rust, Scala, scikit-learn, SciPy, Swift, TensorFlow, Vue.js
The example consists of three scripts:
- [1_programming_query_generation.py](1_programming_query_generation.py) - We generate queries for all paragraphs from these articles
- [2_programming_train_bi-encoder.py](2_programming_train_bi-encoder.py) - We train a SentenceTransformer bi-encoder with these generated queries. This results in a model we can then use for semantic search (for the given Wikipedia articles).
- [3_programming_semantic_search.py](3_programming_semantic_search.py) - Shows how the trained model can be used for semantic search.
```python
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Paragraphs for which we want to generate queries
paragraphs = [
"Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects.",
'Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented and functional programming. Python is often described as a "batteries included" language due to its comprehensive standard library.',
"Python was created in the late 1980s, and first released in 1991, by Guido van Rossum as a successor to the ABC programming language. Python 2.0, released in 2000, introduced new features, such as list comprehensions, and a garbage collection system with reference counting, and was discontinued with version 2.7 in 2020. Python 3.0, released in 2008, was a major revision of the language that is not completely backward-compatible and much Python 2 code does not run unmodified on Python 3. With Python 2's end-of-life (and pip having dropped support in 2021), only Python 3.6.x and later are supported, with older versions still supporting e.g. Windows 7 (and old installers not restricted to 64-bit Windows).",
"Python interpreters are supported for mainstream operating systems and available for a few more (and in the past supported many more). A global community of programmers develops and maintains CPython, a free and open-source reference implementation. A non-profit organization, the Python Software Foundation, manages and directs resources for Python and CPython development.",
"As of January 2021, Python ranks third in TIOBE’s index of most popular programming languages, behind C and Java, having previously gained second place and their award for the most popularity gain for 2020.",
"Java is a class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let application developers write once, run anywhere (WORA), meaning that compiled Java code can run on all platforms that support Java without the need for recompilation. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019, Java was one of the most popular programming languages in use according to GitHub, particularly for client-server web applications, with a reported 9 million developers.",
"Java was originally developed by James Gosling at Sun Microsystems (which has since been acquired by Oracle) and released in 1995 as a core component of Sun Microsystems' Java platform. The original and reference implementation Java compilers, virtual machines, and class libraries were originally released by Sun under proprietary licenses. As of May 2007, in compliance with the specifications of the Java Community Process, Sun had relicensed most of its Java technologies under the GNU General Public License. Oracle offers its own HotSpot Java Virtual Machine, however the official reference implementation is the OpenJDK JVM which is free open source software and used by most developers and is the default JVM for almost all Linux distributions.",
"As of September 2020, the latest version is Java 15, with Java 11, a currently supported long-term support (LTS) version, released on September 25, 2018. Oracle released the last zero-cost public update for the legacy version Java 8 LTS in January 2019 for commercial use, although it will otherwise still support Java 8 with public updates for personal use indefinitely. Other vendors have begun to offer zero-cost builds of OpenJDK 8 and 11 that are still receiving security and other upgrades.",
"Oracle (and others) highly recommend uninstalling outdated versions of Java because of serious risks due to unresolved security issues. Since Java 9, 10, 12, 13, and 14 are no longer supported, Oracle advises its users to immediately transition to the latest version (currently Java 15) or an LTS release.",
]
# For available models for query generation, see: https://huggingface.co/BeIR/
# Here, we use a T5-large model that was trained on the MS MARCO dataset
tokenizer = T5Tokenizer.from_pretrained("BeIR/query-gen-msmarco-t5-large-v1")
model = T5ForConditionalGeneration.from_pretrained("BeIR/query-gen-msmarco-t5-large-v1")
```

For each paragraph, the generated queries look like this:

```
Paragraph:
Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects.
Generated Queries:
1: what is python language used for
2: what is python programming
3: what language do i use for scripts
Paragraph:
Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented and functional programming. Python is often described as a "batteries included" language due to its comprehensive standard library.
Generated Queries:
1: what is python language
2: what programming paradigms do python support
3: what programming languages use python
Paragraph:
Python was created in the late 1980s, and first released in 1991, by Guido van Rossum as a successor to the ABC programming language. Python 2.0, released in 2000, introduced new features, such as list comprehensions, and a garbage collection system with reference counting, and was discontinued with version 2.7 in 2020. Python 3.0, released in 2008, was a major revision of the language that is not completely backward-compatible and much Python 2 code does not run unmodified on Python 3. With Python 2's end-of-life (and pip having dropped support in 2021), only Python 3.6.x and later are supported, with older versions still supporting e.g. Windows 7 (and old installers not restricted to 64-bit Windows).
Generated Queries:
1: what year did python start
2: when does the next python update release
3: when did python come out?
Paragraph:
Python interpreters are supported for mainstream operating systems and available for a few more (and in the past supported many more). A global community of programmers develops and maintains CPython, a free and open-source reference implementation. A non-profit organization, the Python Software Foundation, manages and directs resources for Python and CPython development.
Generated Queries:
1: what platform is python available on
2: what is python used for
3: what is python?
Paragraph:
As of January 2021, Python ranks third in TIOBE’s index of most popular programming languages, behind C and Java, having previously gained second place and their award for the most popularity gain for 2020.
Generated Queries:
1: what is the most used programming language in the world
2: what is python language
3: what is the most popular programming language in the world?
Paragraph:
Java is a class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let application developers write once, run anywhere (WORA), meaning that compiled Java code can run on all platforms that support Java without the need for recompilation. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019, Java was one of the most popular programming languages in use according to GitHub, particularly for client-server web applications, with a reported 9 million developers.
Generated Queries:
1: java how java works
2: what language is similar to java
3: what is java language
Paragraph:
Java was originally developed by James Gosling at Sun Microsystems (which has since been acquired by Oracle) and released in 1995 as a core component of Sun Microsystems' Java platform. The original and reference implementation Java compilers, virtual machines, and class libraries were originally released by Sun under proprietary licenses. As of May 2007, in compliance with the specifications of the Java Community Process, Sun had relicensed most of its Java technologies under the GNU General Public License. Oracle offers its own HotSpot Java Virtual Machine, however the official reference implementation is the OpenJDK JVM which is free open source software and used by most developers and is the default JVM for almost all Linux distributions.
Generated Queries:
1: what is java created by
2: when was java introduced to linux
3: who developed java?
Paragraph:
As of September 2020, the latest version is Java 15, with Java 11, a currently supported long-term support (LTS) version, released on September 25, 2018. Oracle released the last zero-cost public update for the legacy version Java 8 LTS in January 2019 for commercial use, although it will otherwise still support Java 8 with public updates for personal use indefinitely. Other vendors have begun to offer zero-cost builds of OpenJDK 8 and 11 that are still receiving security and other upgrades.
Generated Queries:
1: what is the latest version of java
2: what is the latest java version
3: what is the latest version of java
Paragraph:
Oracle (and others) highly recommend uninstalling outdated versions of Java because of serious risks due to unresolved security issues. Since Java 9, 10, 12, 13, and 14 are no longer supported, Oracle advises its users to immediately transition to the latest version (currently Java 15) or an LTS release.
```