Commit 0fccd232 authored by Rayyyyy

First add
"""
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with MatryoshkaLoss using MultipleNegativesRankingLoss. This trains a model at output dimensions [768, 512, 256, 128, 64].
Entailments are used as positive pairs, and a contradiction from the AllNLI dataset is added as a hard negative.
At every 10% of the training steps, the model is evaluated on the STS benchmark dataset.
Usage:
python matryoshka_nli.py
OR
python matryoshka_nli.py pretrained_transformer_model_name
"""
import math
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilroberta-base"
train_batch_size = 128  # The larger the batch size, the better the results (usually), but it requires more GPU memory
max_seq_length = 75
num_epochs = 1
# Save path of the model
model_save_path = (
"output/matryoshka_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"
if not os.path.exists(nli_dataset_path):
    util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
# Read the AllNLI.tsv.gz file and create the training dataset
logging.info("Read AllNLI train dataset")
def add_to_samples(sent1, sent2, label):
    if sent1 not in train_data:
        train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
    train_data[sent1][label].add(sent2)
train_data = {}
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        if row["split"] == "train":
            sent1 = row["sentence1"].strip()
            sent2 = row["sentence2"].strip()
            add_to_samples(sent1, sent2, row["label"])
            add_to_samples(sent2, sent1, row["label"])  # Also add the opposite
train_samples = []
for sent1, others in train_data.items():
    if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
        train_samples.append(
            InputExample(
                texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
            )
        )
        train_samples.append(
            InputExample(
                texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
            )
        )
logging.info("Train samples: {}".format(len(train_samples)))
# Special data loader that avoids duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=int(len(train_dataloader) * 0.1),
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=False, # Set to True, if your GPU supports FP16 operations
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
test_evaluator(model, output_path=model_save_path)
# Optionally, save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
    model.push_to_hub(f"{model_name}-nli-matryoshka")
except Exception:
    logging.error(
        "Error uploading model to the Hugging Face Hub. To upload it manually, you can run "
        f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({model_save_path!r})` "
        f"and saving it using `model.push_to_hub('{model_name}-nli-matryoshka')`."
    )
"""
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with MatryoshkaLoss using MultipleNegativesRankingLoss. This trains a model at output dimensions [768, 512, 256, 128, 64].
Entailments are used as positive pairs, and a contradiction from the AllNLI dataset is added as a hard negative.
At every 10% of the training steps, the model is evaluated on the STS benchmark dataset.
The difference between this script and matryoshka_nli.py is that this script uses a reduced dimensionality of the base
model by adding a Dense layer with `reduced_dim=256` output dimensions. This might be useful when your desired output
dimensionality is lower than the base model's default output dimensionality.
Usage:
python matryoshka_nli_reduced_dim.py
OR
python matryoshka_nli_reduced_dim.py pretrained_transformer_model_name
"""
import math
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilroberta-base"
train_batch_size = 128  # The larger the batch size, the better the results (usually), but it requires more GPU memory
max_seq_length = 75
num_epochs = 1
reduced_dim = 256
# Save path of the model
model_save_path = (
"output/matryoshka_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
dense = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=reduced_dim)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense])
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"
if not os.path.exists(nli_dataset_path):
    util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
# Read the AllNLI.tsv.gz file and create the training dataset
logging.info("Read AllNLI train dataset")
def add_to_samples(sent1, sent2, label):
    if sent1 not in train_data:
        train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
    train_data[sent1][label].add(sent2)
train_data = {}
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        if row["split"] == "train":
            sent1 = row["sentence1"].strip()
            sent2 = row["sentence2"].strip()
            add_to_samples(sent1, sent2, row["label"])
            add_to_samples(sent2, sent1, row["label"])  # Also add the opposite
train_samples = []
for sent1, others in train_data.items():
    if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
        train_samples.append(
            InputExample(
                texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
            )
        )
        train_samples.append(
            InputExample(
                texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
            )
        )
logging.info("Train samples: {}".format(len(train_samples)))
# Special data loader that avoids duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [256, 128, 64, 32, 16])
stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=int(len(train_dataloader) * 0.1),
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=False, # Set to True, if your GPU supports FP16 operations
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
test_evaluator(model, output_path=model_save_path)
# Optionally, save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
    model.push_to_hub(f"{model_name}-nli-matryoshka-{reduced_dim}")
except Exception:
    logging.error(
        "Error uploading model to the Hugging Face Hub. To upload it manually, you can run "
        f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({model_save_path!r})` "
        f"and saving it using `model.push_to_hub('{model_name}-nli-matryoshka-{reduced_dim}')`."
    )
"""
This example trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch.
It uses MatryoshkaLoss with the powerful CoSENTLoss to train models that perform well at output dimensions [768, 512, 256, 128, 64].
It generates sentence embeddings that can be compared using cosine-similarity to measure the similarity.
Usage:
python matryoshka_sts.py
OR
python matryoshka_sts.py pretrained_transformer_model_name
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
    util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilbert-base-uncased"
# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = (
"output/matryoshka_sts_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
        if row["split"] == "dev":
            dev_samples.append(inp_example)
        elif row["split"] == "test":
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CoSENTLoss(model=model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
# Optionally, save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
    model.push_to_hub(f"{model_name}-sts-matryoshka")
except Exception:
    logging.error(
        "Error uploading model to the Hugging Face Hub. To upload it manually, you can run "
        f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({model_save_path!r})` "
        f"and saving it using `model.push_to_hub('{model_name}-sts-matryoshka')`."
    )
# MS MARCO
[MS MARCO Passage Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) is a large dataset for training information retrieval models. It consists of about 500k real search queries from the Bing search engine, together with the relevant text passages that answer each query.
This page shows how to **train** models (Cross-Encoders and sentence embedding models) on this dataset so that they can be used for searching text passages given queries (keywords, phrases, or questions).
If you are interested in how to use these models, see [Application - Retrieve & Re-Rank](../../applications/retrieve_rerank/README.md).
There are **pre-trained models** available, which you can use directly without training your own models. For more information, see: [Pretrained Models](https://www.sbert.net/docs/pretrained_models.html) | [Pretrained Cross-Encoders](https://www.sbert.net/docs/pretrained_cross-encoders.html)
## Bi-Encoder
Cross-Encoders are only suitable for re-ranking a small set of passages. To retrieve suitable documents from a large collection, we have to use a bi-encoder: the documents are independently encoded into fixed-size embeddings, and a query is embedded into the same vector space. Relevant documents can then be found via dot-product similarity.
![BiEncoder](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/BiEncoder.png)
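As a rough usage sketch (the model name and passages below are placeholders; any MS MARCO bi-encoder trained for dot-product scoring would work), retrieval boils down to encoding the corpus once, encoding the query, and scoring with dot-product:

```python
from sentence_transformers import SentenceTransformer, util

# Placeholder model name: one of the pretrained MS MARCO bi-encoders
model = SentenceTransformer("msmarco-distilbert-base-tas-b")

passages = [
    "London has 9,787,426 inhabitants at the 2011 census.",
    "The Eiffel Tower is located in Paris.",
]
passage_embeddings = model.encode(passages, convert_to_tensor=True)  # documents are encoded independently

query_embedding = model.encode("How many people live in London?", convert_to_tensor=True)
hits = util.semantic_search(query_embedding, passage_embeddings, top_k=2, score_function=util.dot_score)[0]
for hit in hits:
    print(round(hit["score"], 2), passages[hit["corpus_id"]])
```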
There are two strategies to **train a bi-encoder** on the MS MARCO dataset:
### MultipleNegativesRankingLoss
**Training code: [train_bi-encoder_mnrl.py](train_bi-encoder_mnrl.py)**
When we use [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss), we provide triplets: ``(query, positive_passage, negative_passage)`` where `positive_passage` is the relevant passage to the query and `negative_passage` is a non-relevant passage to the query.
We compute the embeddings for all queries, positive passages, and negative passages in the corpus and then optimize the following objective: We want to have the `(query, positive_passage)` pair to be close in the vector space, while `(query, negative_passage)` should be distant in vector space.
To further improve the training, we use **in-batch negatives**:
![MultipleNegativesRankingLoss](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/MultipleNegativeRankingLoss.png)
We embed all `queries`, `positive_passages`, and `negative_passages` into the vector space. The matching `(query_i, positive_passage_i)` should be close, while there should be a large distance between a `query` and all other (positive/negative) passages from all other triplets in a batch. For a batch size of 64, we compare a query against 64+64=128 passages, from which only one passage should be close and the 127 others should be distant in vector space.
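A minimal sketch of this setup (toy triplets and a placeholder base model; the complete pipeline is in train_bi-encoder_mnrl.py):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("distilbert-base-uncased")  # placeholder base model (mean pooling is added automatically)

# Each InputExample is a (query, positive_passage, negative_passage) triplet
train_samples = [
    InputExample(texts=["what is python", "Python is a programming language ...", "A python is a large snake ..."]),
    InputExample(texts=["capital of france", "Paris is the capital of France ...", "France borders Spain ..."]),
]
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=64)

# Besides the provided hard negative, every other passage in the batch acts as an in-batch negative
train_loss = losses.MultipleNegativesRankingLoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
```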
One way to **improve training** is to choose really good negatives, also known as **hard negatives**: the negative should look very similar to the positive passage, but it should not be relevant to the query.
We find these hard negatives in the following way: We use existing retrieval systems (e.g. lexical search and other bi-encoder retrieval systems), and for each query we find the most relevant passages. We then use a powerful [Cross-Encoder](../../applications/cross-encoder/README.md) to score the found `(query, passage)` pairs. We provide scores for 160 million such pairs in our [msmarco-hard-negatives dataset](https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives).
For MultipleNegativesRankingLoss, we must ensure that in the triplet `(query, positive_passage, negative_passage)` the `negative_passage` is actually not relevant for the query. The MS MARCO dataset is sadly **highly redundant**: even though there is on average only one passage marked as relevant per query, it actually contains many passages that humans would consider relevant. We must ensure that these passages are **not passed as negatives**. We do this by requiring a certain margin between the CrossEncoder scores of the relevant passages and the mined hard negatives. By default, we set a margin of 3: if the `(query, positive_passage)` pair gets a score of 9 from the CrossEncoder, then we only consider negatives with a CrossEncoder score below 6. This margin ensures that the passages we use as negatives are actually much less relevant than the annotated positive.
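In code, the filtering amounts to the check below (names are illustrative; the same logic appears in train_bi-encoder_mnrl.py further down in this commit):

```python
ce_score_margin = 3.0  # default margin between positives and mined negatives

def filter_hard_negatives(qid, pos_pids, candidate_neg_pids, ce_scores, margin=ce_score_margin):
    """Keep only mined negatives whose CrossEncoder score is at least `margin` below the weakest positive."""
    pos_min_ce_score = min(ce_scores[qid][pid] for pid in pos_pids)
    ce_score_threshold = pos_min_ce_score - margin
    return [pid for pid in candidate_neg_pids if ce_scores[qid][pid] <= ce_score_threshold]
```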
### MarginMSE
**Training code: [train_bi-encoder_margin-mse.py](train_bi-encoder_margin-mse.py)**
[MarginMSELoss](https://www.sbert.net/docs/package_reference/losses.html#marginmseloss) is based on the paper by [Hofstätter et al.](https://arxiv.org/abs/2010.02666). As with MultipleNegativesRankingLoss, we use triplets: `(query, passage1, passage2)`. In contrast to MultipleNegativesRankingLoss, `passage1` and `passage2` do not have to be strictly positive/negative; both can be relevant or not relevant for the given query.
We then compute the [Cross-Encoder](../../applications/cross-encoder/README.md) score for `(query, passage1)` and `(query, passage2)`. We provide scores for 160 million such pairs in our [msmarco-hard-negatives dataset](https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives). We then compute the distance: `CE_distance = CEScore(query, passage1) - CEScore(query, passage2)`
For our bi-encoder training, we encode `query`, `passage1`, and `passage2` into the same vector space and then measure the dot-product between `(query, passage1)` and `(query, passage2)`. Again, we measure the distance: `BE_distance = DotScore(query, passage1) - DotScore(query, passage2)`
We then want to ensure that the distance predicted by the bi-encoder is close to the distance predicted by the cross-encoder, i.e., we optimize the mean-squared error (MSE) between `CE_distance` and `BE_distance`.
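Conceptually, the training setup looks like the following sketch (the CE scores are made-up numbers and the base model is a placeholder; the complete pipeline is in train_bi-encoder_margin-mse.py):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("distilbert-base-uncased")  # placeholder base model

# label = CE_distance = CEScore(query, passage1) - CEScore(query, passage2)
train_samples = [
    InputExample(texts=["what is python", "Python is a programming language ...", "A python is a large snake ..."], label=8.2),
    InputExample(texts=["capital of france", "Paris is the capital of France ...", "France borders Spain ..."], label=7.5),
]
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=64)

# MarginMSELoss minimizes MSE(BE_distance, CE_distance), where BE_distance is the
# dot-product difference DotScore(query, passage1) - DotScore(query, passage2)
train_loss = losses.MarginMSELoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
```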
An **advantage** of MarginMSELoss compared to MultipleNegativesRankingLoss is that we **don't require** a `positive` and `negative` passage. As mentioned before, MS MARCO is redundant, and many passages contain the same or similar content. With MarginMSELoss, we can train on two relevant passages without issues: In that case, the `CE_distance` will be smaller and we expect that our bi-encoder also puts both passages closer in the vector space.
A **disadvantage** of MarginMSELoss is the slower training time: we need far more epochs to get good results. With MultipleNegativesRankingLoss and a batch size of 64, we compare one query against 128 passages; with MarginMSELoss, we compare a query against only two passages.
## Cross-Encoder
A [Cross-Encoder](https://www.sbert.net/examples/applications/cross-encoder/README.html) accepts both inputs, the query and a candidate passage, simultaneously and returns a score between 0 and 1 indicating how relevant the passage is for the given query.
![CrossEncoder](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/CrossEncoder.png)
Cross-Encoders are often used for **re-ranking**: given a list of possibly relevant passages for a query, for example retrieved with BM25 / Elasticsearch, the cross-encoder re-ranks this list so that the most relevant passages are at the top of the result list.
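A small re-ranking sketch (the model name is one of the pretrained MS MARCO cross-encoders; the query and passages are placeholders):

```python
from sentence_transformers import CrossEncoder

model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)

query = "How many people live in London?"
retrieved_passages = [  # e.g. the top hits returned by BM25 / Elasticsearch
    "The Eiffel Tower is located in Paris.",
    "London has 9,787,426 inhabitants at the 2011 census.",
]

# Score every (query, passage) pair and sort the passages by descending relevance
scores = model.predict([(query, passage) for passage in retrieved_passages])
reranked = sorted(zip(retrieved_passages, scores), key=lambda x: x[1], reverse=True)
for passage, score in reranked:
    print(round(float(score), 2), passage)
```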
To **train a cross-encoder** on the MS MARCO dataset, see:
- **[train_cross-encoder_scratch.py](train_cross-encoder_scratch.py)** trains a cross-encoder from scratch using the provided data from the MS MARCO dataset.
## Cross-Encoder Knowledge Distillation
![](https://github.com/UKPLab/sentence-transformers/raw/master/docs/img/msmarco-training-ce-distillation.png)
- **[train_cross-encoder_kd.py](train_cross-encoder_kd.py)** uses a knowledge distillation setup: [Hofstätter et al.](https://arxiv.org/abs/2010.02666) trained an ensemble of 3 (large) models for the MS MARCO dataset and predicted the scores for various (query, passage)-pairs (50% positive, 50% negative). In this example, we use knowledge distillation with a small & fast model and learn the logit scores from the teacher ensemble. This yields performance comparable to the large models, while being 18 times faster. A condensed sketch of the training objective is shown below.
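The distillation objective, in a condensed sketch (teacher scores here are made-up numbers; the full pipeline, including the data download, is in train_cross-encoder_kd.py later in this commit):

```python
import torch
from torch.utils.data import DataLoader
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder

# Identity activation so the student outputs raw logits, matching the teacher's score scale
student = CrossEncoder(
    "microsoft/MiniLM-L12-H384-uncased", num_labels=1, max_length=512, default_activation_function=torch.nn.Identity()
)

# (query, passage) pairs labeled with the teacher ensemble's logit scores (placeholder values)
train_samples = [
    InputExample(texts=["what is python", "Python is a programming language ..."], label=9.1),
    InputExample(texts=["what is python", "A python is a large snake ..."], label=-2.3),
]
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)

# The student regresses directly onto the teacher scores via MSE
student.fit(train_dataloader=train_dataloader, loss_fct=torch.nn.MSELoss(), epochs=1, warmup_steps=10)
```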
"""
This file evaluates CrossEncoder on the TREC 2019 Deep Learning (DL) Track: https://arxiv.org/abs/2003.07820
TREC 2019 DL is based on the MS MARCO corpus. MS MARCO provides sparse annotations, i.e., usually only a single
passage is marked as relevant for a given query. Many other highly relevant passages are not annotated and are hence
counted as errors if a model ranks them high.
TREC DL instead annotated up to 200 passages per query for their relevance to the query. It is therefore better suited to
estimating the model performance for the task of re-ranking in Information Retrieval.
Run:
python eval_cross-encoder-trec-dl.py cross-encoder-model-name
"""
import gzip
from collections import defaultdict
import logging
import tqdm
import numpy as np
import sys
import pytrec_eval
from sentence_transformers import util, CrossEncoder
import os
data_folder = "trec2019-data"
os.makedirs(data_folder, exist_ok=True)
# Read test queries
queries = {}
queries_filepath = os.path.join(data_folder, "msmarco-test2019-queries.tsv.gz")
if not os.path.exists(queries_filepath):
    logging.info("Download " + os.path.basename(queries_filepath))
    util.http_get(
        "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", queries_filepath
    )
with gzip.open(queries_filepath, "rt", encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        queries[qid] = query
# Read which passages are relevant
relevant_docs = defaultdict(lambda: defaultdict(int))
qrels_filepath = os.path.join(data_folder, "2019qrels-pass.txt")
if not os.path.exists(qrels_filepath):
    logging.info("Download " + os.path.basename(qrels_filepath))
    util.http_get("https://trec.nist.gov/data/deep/2019qrels-pass.txt", qrels_filepath)
with open(qrels_filepath) as fIn:
    for line in fIn:
        qid, _, pid, score = line.strip().split()
        score = int(score)
        if score > 0:
            relevant_docs[qid][pid] = score
# Only use queries that have at least one relevant passage
relevant_qid = []
for qid in queries:
    if len(relevant_docs[qid]) > 0:
        relevant_qid.append(qid)
# Read the top 1000 passages that are supposed to be re-ranked
passage_filepath = os.path.join(data_folder, "msmarco-passagetest2019-top1000.tsv.gz")
if not os.path.exists(passage_filepath):
    logging.info("Download " + os.path.basename(passage_filepath))
    util.http_get(
        "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz", passage_filepath
    )
passage_cand = {}
with gzip.open(passage_filepath, "rt", encoding="utf8") as fIn:
    for line in fIn:
        qid, pid, query, passage = line.strip().split("\t")
        if qid not in passage_cand:
            passage_cand[qid] = []
        passage_cand[qid].append([pid, passage])
logging.info("Queries: {}".format(len(queries)))
queries_result_list = []
run = {}
model = CrossEncoder(sys.argv[1], max_length=512)
for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]
    cand = passage_cand[qid]
    pids = [c[0] for c in cand]
    corpus_sentences = [c[1] for c in cand]
    cross_inp = [[query, sent] for sent in corpus_sentences]
    if model.config.num_labels > 1:  # Cross-Encoders that predict more than one score: apply softmax and use the positive-class score
        cross_scores = model.predict(cross_inp, apply_softmax=True)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp).tolist()
    cross_scores_sparse = {}
    for idx, pid in enumerate(pids):
        cross_scores_sparse[pid] = cross_scores[idx]
    sparse_scores = cross_scores_sparse
    run[qid] = {}
    for pid in sparse_scores:
        run[qid][pid] = float(sparse_scores[pid])
evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {"ndcg_cut.10"})
scores = evaluator.evaluate(run)
print("Queries:", len(relevant_qid))
print("NDCG@10: {:.2f}".format(np.mean([ele["ndcg_cut_10"] for ele in scores.values()]) * 100))
"""
This script runs the evaluation of an SBERT msmarco model on the
MS MARCO dev dataset and reports different performance metrics for cosine similarity & dot-product.
Usage:
python eval_msmarco.py model_name [max_corpus_size_in_thousands]
"""
from sentence_transformers import LoggingHandler, SentenceTransformer, evaluation, util
import logging
import sys
import os
import tarfile
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Name of the SBERT model
model_name = sys.argv[1]
# You can limit the approx. max size of the corpus. Pass 100 as the second parameter and the corpus will have a size of approx. 100k docs
corpus_max_size = int(sys.argv[2]) * 1000 if len(sys.argv) >= 3 else 0
#### Load model
model = SentenceTransformer(model_name)
### Data files
data_folder = "msmarco-data"
os.makedirs(data_folder, exist_ok=True)
collection_filepath = os.path.join(data_folder, "collection.tsv")
dev_queries_file = os.path.join(data_folder, "queries.dev.small.tsv")
qrels_filepath = os.path.join(data_folder, "qrels.dev.tsv")
### Download files if needed
if not os.path.exists(collection_filepath) or not os.path.exists(dev_queries_file):
    tar_filepath = os.path.join(data_folder, "collectionandqueries.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download: " + tar_filepath)
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
if not os.path.exists(qrels_filepath):
    util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv", qrels_filepath)
### Load data
corpus = {} # Our corpus pid => passage
dev_queries = {} # Our dev queries. qid => query
dev_rel_docs = {} # Mapping qid => set with relevant pids
needed_pids = set() # Passage IDs we need
needed_qids = set() # Query IDs we need
# Load the 6980 dev queries
with open(dev_queries_file, encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        dev_queries[qid] = query.strip()
# Load which passages are relevant for which queries
with open(qrels_filepath) as fIn:
    for line in fIn:
        qid, _, pid, _ = line.strip().split("\t")
        if qid not in dev_queries:
            continue
        if qid not in dev_rel_docs:
            dev_rel_docs[qid] = set()
        dev_rel_docs[qid].add(pid)
        needed_pids.add(pid)
        needed_qids.add(qid)
# Read passages
with open(collection_filepath, encoding="utf8") as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        if pid in needed_pids or corpus_max_size <= 0 or len(corpus) <= corpus_max_size:
            corpus[pid] = passage.strip()
## Run evaluator
logging.info("Queries: {}".format(len(dev_queries)))
logging.info("Corpus: {}".format(len(corpus)))
ir_evaluator = evaluation.InformationRetrievalEvaluator(
dev_queries,
corpus,
dev_rel_docs,
show_progress_bar=True,
corpus_chunk_size=100000,
precision_recall_at_k=[10, 100],
name="msmarco dev",
)
ir_evaluator(model)
# MS MARCO - Multilingual Training
This folder demonstrates how to train a multi-lingual SBERT model for [semantic search](https://www.sbert.net/examples/applications/semantic-search/README.html) / [information retrieval](https://www.sbert.net/examples/applications/retrieve_rerank/README.html).
As dataset, we use the [MS MARCO Passage Ranking dataset](https://github.com/microsoft/MSMARCO-Passage-Ranking). It is a large dataset consisting of search queries from the Bing search engine, together with the relevant text passages that answer each query.
Sadly, this dataset is only available in English. As there are no large multilingual datasets suitable for training a semantic search model, we use **machine translation** to translate the training data.
## Translating Data
We will translate the queries and the passages using [EasyNMT](https://github.com/UKPLab/EasyNMT), which provides state-of-the-art machine translation to 150+ languages.
Then, we will use [Multilingual Knowledge Distillation](https://www.sbert.net/examples/training/multilingual/README.html) and transform the English model trained on MS MARCO to a multi-lingual model.
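A rough sketch of that distillation step (model names are placeholders, and the parallel-query file path assumes the output of the translation script below):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.datasets import ParallelSentencesDataset

teacher = SentenceTransformer("msmarco-distilbert-base-tas-b")  # English MS MARCO model (placeholder)
student = SentenceTransformer("xlm-roberta-base")               # multilingual student (placeholder)

# TSV file with tab-separated parallel sentences: english_query \t translated_query
train_data = ParallelSentencesDataset(student_model=student, teacher_model=teacher)
train_data.load_data("multilingual-data/train_queries.en-de.tsv")

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=64)
# The student learns to mimic the teacher's embeddings for both the English and the translated queries
train_loss = losses.MSELoss(model=student)

student.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=1000)
```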
"""
This script translates the queries in the MS MARCO dataset to the defined target languages.
For machine translation, we use EasyNMT: https://github.com/UKPLab/EasyNMT
You can install it via: pip install easynmt
Usage:
python translate_queries.py [target_language]
"""
import os
from sentence_transformers import LoggingHandler, util
import logging
import tarfile
from easynmt import EasyNMT
import sys
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
target_lang = sys.argv[1]
output_folder = "multilingual-data"
data_folder = "../msmarco-data"
output_filename = os.path.join(output_folder, "train_queries.en-{}.tsv".format(target_lang))
os.makedirs(output_folder, exist_ok=True)
## Does the output file exist? If yes, read it so we can continue the translation
translated_qids = set()
if os.path.exists(output_filename):
    with open(output_filename, "r", encoding="utf8") as fIn:
        for line in fIn:
            splits = line.strip().split("\t")
            translated_qids.add(splits[0])
### Now we read the MS Marco dataset
os.makedirs(data_folder, exist_ok=True)
# Read qrels file for relevant positives per query
train_queries = {}
qrels_train = os.path.join(data_folder, "qrels.train.tsv")
if not os.path.exists(qrels_train):
    util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv", qrels_train)
with open(qrels_train) as fIn:
    for line in fIn:
        qid, _, pid, _ = line.strip().split()
        if qid not in translated_qids:
            train_queries[qid] = None
# Read all queries
queries_filepath = os.path.join(data_folder, "queries.train.tsv")
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, "queries.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download queries.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
with open(queries_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        if qid in train_queries:
            train_queries[qid] = query.strip()
qids = [qid for qid in train_queries if train_queries[qid] is not None]
queries = [train_queries[qid] for qid in qids]
# Define our translation model
translation_model = EasyNMT("opus-mt")
print("Start translation of {} queries.".format(len(queries)))
print("This can take a while. But you can stop this script at any point")
with open(output_filename, "a" if os.path.exists(output_filename) else "w", encoding="utf8") as fOut:
    for qid, query, translated_query in zip(
        qids,
        queries,
        translation_model.translate_stream(
            queries,
            source_lang="en",
            target_lang=target_lang,
            beam_size=2,
            perform_sentence_splitting=False,
            chunk_size=256,
            batch_size=64,
        ),
    ):
        fOut.write("{}\t{}\n".format(qid, translated_query.replace("\t", " ")))
        fOut.flush()
import sys
import json
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, losses, InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
import tqdm
from torch.utils.data import Dataset
import random
from shutil import copyfile
import pickle
import argparse
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
parser = argparse.ArgumentParser()
parser.add_argument("--train_batch_size", default=64, type=int)
parser.add_argument("--max_seq_length", default=300, type=int)
parser.add_argument("--model_name", required=True)
parser.add_argument("--max_passages", default=0, type=int)
parser.add_argument("--epochs", default=30, type=int)
parser.add_argument("--pooling", default="mean")
parser.add_argument(
"--negs_to_use",
default=None,
help="From which systems should negatives be used? Multiple systems separated by comma. None = all",
)
parser.add_argument("--warmup_steps", default=1000, type=int)
parser.add_argument("--lr", default=2e-5, type=float)
parser.add_argument("--num_negs_per_system", default=5, type=int)
parser.add_argument("--use_pre_trained_model", default=False, action="store_true")
parser.add_argument("--use_all_queries", default=False, action="store_true")
args = parser.parse_args()
logging.info(str(args))
# The model we want to fine-tune
train_batch_size = (
args.train_batch_size
) # Increasing the train batch size improves the model performance, but requires more GPU memory
model_name = args.model_name
max_passages = args.max_passages
max_seq_length = args.max_seq_length # Max length for passages. Increasing it, requires more GPU memory
num_negs_per_system = (
args.num_negs_per_system
) # We used different systems to mine hard negatives. Number of hard negatives to add from each system
num_epochs = args.epochs # Number of epochs we want to train
# Load our embedding model
if args.use_pre_trained_model:
    logging.info("use pretrained SBERT model")
    model = SentenceTransformer(model_name)
    model.max_seq_length = max_seq_length
else:
    logging.info("Create new SBERT model")
    word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), args.pooling)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
model_save_path = f'output/train_bi-encoder-margin_mse-{model_name.replace("/", "-")}-batch_size_{train_batch_size}-{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}'
# Write self to path
os.makedirs(model_save_path, exist_ok=True)
train_script_path = os.path.join(model_save_path, "train_script.py")
copyfile(__file__, train_script_path)
with open(train_script_path, "a") as fOut:
    fOut.write("\n\n# Script was called via:\n#python " + " ".join(sys.argv))
### Now we read the MS Marco dataset
data_folder = "msmarco-data"
#### Read the corpus files, that contain all the passages. Store them in the corpus dict
corpus = {} # dict in the format: passage_id -> passage. Stores all existent passages
collection_filepath = os.path.join(data_folder, "collection.tsv")
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, "collection.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
logging.info("Read corpus: collection.tsv")
with open(collection_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        pid = int(pid)
        corpus[pid] = passage
### Read the train queries, store in queries dict
queries = {} # dict in the format: query_id -> query. Stores all training queries
queries_filepath = os.path.join(data_folder, "queries.train.tsv")
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, "queries.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download queries.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
with open(queries_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        qid = int(qid)
        queries[qid] = query
# Load a dict (qid, pid) -> ce_score that maps query-ids (qid) and paragraph-ids (pid)
# to the CrossEncoder score computed by the cross-encoder/ms-marco-MiniLM-L-6-v2 model
ce_scores_file = os.path.join(data_folder, "cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz")
if not os.path.exists(ce_scores_file):
    logging.info("Download cross-encoder scores file")
    util.http_get(
        "https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives/resolve/main/cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz",
        ce_scores_file,
    )
logging.info("Load CrossEncoder scores dict")
with gzip.open(ce_scores_file, "rb") as fIn:
    ce_scores = pickle.load(fIn)
# As training data we use hard-negatives that have been mined using various systems
hard_negatives_filepath = os.path.join(data_folder, "msmarco-hard-negatives.jsonl.gz")
if not os.path.exists(hard_negatives_filepath):
    logging.info("Download hard negatives file")
    util.http_get(
        "https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives/resolve/main/msmarco-hard-negatives.jsonl.gz",
        hard_negatives_filepath,
    )
logging.info("Read hard negatives train file")
train_queries = {}
negs_to_use = None
with gzip.open(hard_negatives_filepath, "rt") as fIn:
    for line in tqdm.tqdm(fIn):
        if max_passages > 0 and len(train_queries) >= max_passages:
            break
        data = json.loads(line)

        # Get the positive passage ids
        pos_pids = data["pos"]

        # Get the hard negatives
        neg_pids = set()
        if negs_to_use is None:
            if args.negs_to_use is not None:  # Use specific system for negatives
                negs_to_use = args.negs_to_use.split(",")
            else:  # Use all systems
                negs_to_use = list(data["neg"].keys())
            logging.info("Using negatives from the following systems: {}".format(", ".join(negs_to_use)))

        for system_name in negs_to_use:
            if system_name not in data["neg"]:
                continue

            system_negs = data["neg"][system_name]
            negs_added = 0
            for pid in system_negs:
                if pid not in neg_pids:
                    neg_pids.add(pid)
                    negs_added += 1
                    if negs_added >= num_negs_per_system:
                        break

        if args.use_all_queries or (len(pos_pids) > 0 and len(neg_pids) > 0):
            train_queries[data["qid"]] = {
                "qid": data["qid"],
                "query": queries[data["qid"]],
                "pos": pos_pids,
                "neg": neg_pids,
            }
logging.info("Train queries: {}".format(len(train_queries)))
# We create a custom MSMARCO dataset that returns triplets (query, positive, negative)
# on-the-fly based on the information from the mined-hard-negatives jsonl file.
class MSMARCODataset(Dataset):
    def __init__(self, queries, corpus, ce_scores):
        self.queries = queries
        self.queries_ids = list(queries.keys())
        self.corpus = corpus
        self.ce_scores = ce_scores

        for qid in self.queries:
            self.queries[qid]["pos"] = list(self.queries[qid]["pos"])
            self.queries[qid]["neg"] = list(self.queries[qid]["neg"])
            random.shuffle(self.queries[qid]["neg"])

    def __getitem__(self, item):
        query = self.queries[self.queries_ids[item]]
        query_text = query["query"]
        qid = query["qid"]

        if len(query["pos"]) > 0:
            pos_id = query["pos"].pop(0)  # Pop positive and add at end
            pos_text = self.corpus[pos_id]
            query["pos"].append(pos_id)
        else:  # We only have negatives, use two negs
            pos_id = query["neg"].pop(0)  # Pop negative and add at end
            pos_text = self.corpus[pos_id]
            query["neg"].append(pos_id)

        # Get a negative passage
        neg_id = query["neg"].pop(0)  # Pop negative and add at end
        neg_text = self.corpus[neg_id]
        query["neg"].append(neg_id)

        pos_score = self.ce_scores[qid][pos_id]
        neg_score = self.ce_scores[qid][neg_id]

        return InputExample(texts=[query_text, pos_text, neg_text], label=pos_score - neg_score)

    def __len__(self):
        return len(self.queries)
# For training the SentenceTransformer model, we need a dataset, a dataloader, and a loss used for training.
train_dataset = MSMARCODataset(queries=train_queries, corpus=corpus, ce_scores=ce_scores)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size, drop_last=True)
train_loss = losses.MarginMSELoss(model=model)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=args.warmup_steps,
use_amp=True,
checkpoint_path=model_save_path,
checkpoint_save_steps=10000,
optimizer_params={"lr": args.lr},
)
# Save latest model
model.save(model_save_path)
"""
This example shows how to train a Bi-Encoder for the MS MARCO dataset (https://github.com/microsoft/MSMARCO-Passage-Ranking).
The queries and passages are passed independently to the transformer network to produce fixed-sized embeddings.
These embeddings can then be compared using cosine-similarity to find matching passages for a given query.
For training, we use MultipleNegativesRankingLoss. There, we pass triplets in the format:
(query, positive_passage, negative_passage)
Negative passages are hard negative examples that were mined using different dense embedding and lexical search methods.
Each positive and negative passage comes with a score from a Cross-Encoder. This allows denoising, i.e. removing false negative
passages that are actually relevant for the query.
With a distilbert-base-uncased model, it should achieve a performance of about 33.79 MRR@10 on the MS MARCO Passages Dev-Corpus.
Running this script:
python train_bi-encoder_mnrl.py
"""
import json
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, losses, InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
import tqdm
from torch.utils.data import Dataset
import random
import pickle
import argparse
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
parser = argparse.ArgumentParser()
parser.add_argument("--train_batch_size", default=64, type=int)
parser.add_argument("--max_seq_length", default=300, type=int)
parser.add_argument("--model_name", required=True)
parser.add_argument("--max_passages", default=0, type=int)
parser.add_argument("--epochs", default=10, type=int)
parser.add_argument("--pooling", default="mean")
parser.add_argument(
"--negs_to_use",
default=None,
help="From which systems should negatives be used? Multiple systems separated by comma. None = all",
)
parser.add_argument("--warmup_steps", default=1000, type=int)
parser.add_argument("--lr", default=2e-5, type=float)
parser.add_argument("--num_negs_per_system", default=5, type=int)
parser.add_argument("--use_pre_trained_model", default=False, action="store_true")
parser.add_argument("--use_all_queries", default=False, action="store_true")
parser.add_argument("--ce_score_margin", default=3.0, type=float)
args = parser.parse_args()
print(args)
# The model we want to fine-tune
model_name = args.model_name
train_batch_size = (
args.train_batch_size
) # Increasing the train batch size improves the model performance, but requires more GPU memory
max_seq_length = args.max_seq_length # Max length for passages. Increasing it, requires more GPU memory
ce_score_margin = args.ce_score_margin # Margin for the CrossEncoder score between negative and positive passages
num_negs_per_system = (
args.num_negs_per_system
) # We used different systems to mine hard negatives. Number of hard negatives to add from each system
num_epochs = args.epochs # Number of epochs we want to train
# Load our embedding model
if args.use_pre_trained_model:
    logging.info("use pretrained SBERT model")
    model = SentenceTransformer(model_name)
    model.max_seq_length = max_seq_length
else:
    logging.info("Create new SBERT model")
    word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), args.pooling)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
model_save_path = "output/train_bi-encoder-mnrl-{}-margin_{:.1f}-{}".format(
model_name.replace("/", "-"), ce_score_margin, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
### Now we read the MS Marco dataset
data_folder = "msmarco-data"
#### Read the corpus files, that contain all the passages. Store them in the corpus dict
corpus = {} # dict in the format: passage_id -> passage. Stores all existent passages
collection_filepath = os.path.join(data_folder, "collection.tsv")
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, "collection.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
logging.info("Read corpus: collection.tsv")
with open(collection_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        pid = int(pid)
        corpus[pid] = passage
### Read the train queries, store in queries dict
queries = {} # dict in the format: query_id -> query. Stores all training queries
queries_filepath = os.path.join(data_folder, "queries.train.tsv")
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, "queries.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download queries.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
with open(queries_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        qid = int(qid)
        queries[qid] = query
# Load a dict (qid, pid) -> ce_score that maps query-ids (qid) and paragraph-ids (pid)
# to the CrossEncoder score computed by the cross-encoder/ms-marco-MiniLM-L-6-v2 model
ce_scores_file = os.path.join(data_folder, "cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz")
if not os.path.exists(ce_scores_file):
    logging.info("Download cross-encoder scores file")
    util.http_get(
        "https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives/resolve/main/cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz",
        ce_scores_file,
    )
logging.info("Load CrossEncoder scores dict")
with gzip.open(ce_scores_file, "rb") as fIn:
    ce_scores = pickle.load(fIn)
# As training data we use hard-negatives that have been mined using various systems
hard_negatives_filepath = os.path.join(data_folder, "msmarco-hard-negatives.jsonl.gz")
if not os.path.exists(hard_negatives_filepath):
    logging.info("Download hard negatives file")
    util.http_get(
        "https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives/resolve/main/msmarco-hard-negatives.jsonl.gz",
        hard_negatives_filepath,
    )
logging.info("Read hard negatives train file")
train_queries = {}
negs_to_use = None
with gzip.open(hard_negatives_filepath, "rt") as fIn:
    for line in tqdm.tqdm(fIn):
        data = json.loads(line)

        # Get the positive passage ids
        qid = data["qid"]
        pos_pids = data["pos"]
        if len(pos_pids) == 0:  # Skip entries without positive passages
            continue

        pos_min_ce_score = min([ce_scores[qid][pid] for pid in data["pos"]])
        ce_score_threshold = pos_min_ce_score - ce_score_margin

        # Get the hard negatives
        neg_pids = set()
        if negs_to_use is None:
            if args.negs_to_use is not None:  # Use specific system for negatives
                negs_to_use = args.negs_to_use.split(",")
            else:  # Use all systems
                negs_to_use = list(data["neg"].keys())
            logging.info("Using negatives from the following systems: {}".format(", ".join(negs_to_use)))

        for system_name in negs_to_use:
            if system_name not in data["neg"]:
                continue

            system_negs = data["neg"][system_name]
            negs_added = 0
            for pid in system_negs:
                if ce_scores[qid][pid] > ce_score_threshold:
                    continue

                if pid not in neg_pids:
                    neg_pids.add(pid)
                    negs_added += 1
                    if negs_added >= num_negs_per_system:
                        break

        if args.use_all_queries or (len(pos_pids) > 0 and len(neg_pids) > 0):
            train_queries[data["qid"]] = {
                "qid": data["qid"],
                "query": queries[data["qid"]],
                "pos": pos_pids,
                "neg": neg_pids,
            }
del ce_scores
logging.info("Train queries: {}".format(len(train_queries)))
# We create a custom MSMARCO dataset that returns triplets (query, positive, negative)
# on-the-fly based on the information from the mined-hard-negatives jsonl file.
class MSMARCODataset(Dataset):
    def __init__(self, queries, corpus):
        self.queries = queries
        self.queries_ids = list(queries.keys())
        self.corpus = corpus

        for qid in self.queries:
            self.queries[qid]["pos"] = list(self.queries[qid]["pos"])
            self.queries[qid]["neg"] = list(self.queries[qid]["neg"])
            random.shuffle(self.queries[qid]["neg"])

    def __getitem__(self, item):
        query = self.queries[self.queries_ids[item]]
        query_text = query["query"]

        pos_id = query["pos"].pop(0)  # Pop positive and add at end
        pos_text = self.corpus[pos_id]
        query["pos"].append(pos_id)

        neg_id = query["neg"].pop(0)  # Pop negative and add at end
        neg_text = self.corpus[neg_id]
        query["neg"].append(neg_id)

        return InputExample(texts=[query_text, pos_text, neg_text])

    def __len__(self):
        return len(self.queries)
# For training the SentenceTransformer model, we need a dataset, a dataloader, and a loss used for training.
train_dataset = MSMARCODataset(train_queries, corpus=corpus)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=args.warmup_steps,
use_amp=True,
checkpoint_path=model_save_path,
checkpoint_save_steps=len(train_dataloader),
optimizer_params={"lr": args.lr},
)
# Save the model
model.save(model_save_path)
"""
This example shows how to train a Cross-Encoder for the MS MARCO dataset (https://github.com/microsoft/MSMARCO-Passage-Ranking).
In this example, we use a knowledge distillation setup: Sebastian Hofstätter et al. (https://arxiv.org/abs/2010.02666) trained
an ensemble of large Transformer models on the MS MARCO dataset and combined the scores from a BERT-base, BERT-large, and ALBERT-large model.
We use the logit scores from the ensemble to train a smaller model. We found that the MiniLM model gives the best performance while
offering the highest speed.
The resulting Cross-Encoder can then be used for passage re-ranking: You retrieve for example 100 passages
for a given query, for example with Elasticsearch, and pass the query+retrieved_passage to the CrossEncoder
for scoring. You then sort the results according to the output of the CrossEncoder.
This gives a significant boost compared to out-of-the-box Elasticsearch / BM25 ranking.
Running this script:
python train_cross-encoder_kd.py
"""
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
import torch
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# First, we define the transformer model we want to fine-tune
model_name = "microsoft/MiniLM-L12-H384-uncased"
train_batch_size = 32
num_epochs = 1
model_save_path = (
"output/training_ms-marco_cross-encoder-v2-"
+ model_name.replace("/", "-")
+ "-"
+ datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# We set num_labels=1 and set the activation function to Identity, so that we get the raw logits
model = CrossEncoder(model_name, num_labels=1, max_length=512, default_activation_function=torch.nn.Identity())
### Now we read the MS Marco dataset
data_folder = "msmarco-data"
os.makedirs(data_folder, exist_ok=True)
#### Read the corpus files, that contain all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, "collection.tsv")
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, "collection.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
with open(collection_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        corpus[pid] = passage
### Read the train queries, store in queries dict
queries = {}
queries_filepath = os.path.join(data_folder, "queries.train.tsv")
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, "queries.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download queries.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
with open(queries_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        queries[qid] = query
### Now we create our dev data
train_samples = []
dev_samples = {}
# We use 200 random queries from the train set for evaluation during training
# Each query has at least one relevant and up to 200 irrelevant (negative) passages
num_dev_queries = 200
num_max_dev_negatives = 200
# msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz and msmarco-qidpidtriples.rnd-shuf.train.tsv.gz are randomly
# shuffled versions of qidpidtriples.train.full.2.tsv.gz from the MS Marco website
# The train-eval split contains 500 random queries that can be used for evaluation during training
train_eval_filepath = os.path.join(data_folder, "msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz")
if not os.path.exists(train_eval_filepath):
logging.info("Download " + os.path.basename(train_eval_filepath))
util.http_get("https://sbert.net/datasets/msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz", train_eval_filepath)
with gzip.open(train_eval_filepath, "rt") as fIn:
for line in fIn:
qid, pos_id, neg_id = line.strip().split()
if qid not in dev_samples and len(dev_samples) < num_dev_queries:
dev_samples[qid] = {"query": queries[qid], "positive": set(), "negative": set()}
if qid in dev_samples:
dev_samples[qid]["positive"].add(corpus[pos_id])
if len(dev_samples[qid]["negative"]) < num_max_dev_negatives:
dev_samples[qid]["negative"].add(corpus[neg_id])
dev_qids = set(dev_samples.keys())
# Read our training file
# As input examples, we provide the (query, passage) pair together with the logit score from the teacher ensemble
teacher_logits_filepath = os.path.join(data_folder, "bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv")
train_samples = []
if not os.path.exists(teacher_logits_filepath):
util.http_get(
"https://zenodo.org/record/4068216/files/bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv?download=1",
teacher_logits_filepath,
)
with open(teacher_logits_filepath) as fIn:
for line in fIn:
pos_score, neg_score, qid, pid1, pid2 = line.strip().split("\t")
if qid in dev_qids: # Skip queries in our dev dataset
continue
train_samples.append(InputExample(texts=[queries[qid], corpus[pid1]], label=float(pos_score)))
train_samples.append(InputExample(texts=[queries[qid], corpus[pid2]], label=float(neg_score)))
# We create a DataLoader to load our train samples
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)
# We add an evaluator, which evaluates the performance during training
# It re-ranks the positive and negative passages for each dev query and computes Mean Reciprocal Rank (MRR)
evaluator = CERerankingEvaluator(dev_samples, name="train-eval")
# Configure the training
warmup_steps = 5000
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_dataloader=train_dataloader,
loss_fct=torch.nn.MSELoss(),
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=5000,
warmup_steps=warmup_steps,
output_path=model_save_path,
optimizer_params={"lr": 7e-6},
use_amp=True,
)
# Save latest model
model.save(model_save_path + "-latest")
"""
This example shows how to train a Cross-Encoder for the MS MARCO dataset (https://github.com/microsoft/MSMARCO-Passage-Ranking).
The query and the passage are passed simultaneously to a Transformer network. The network then returns
a score between 0 and 1 indicating how relevant the passage is for the given query.
The resulting Cross-Encoder can then be used for passage re-ranking: you retrieve, for example, 100 passages
for a given query (e.g. with Elasticsearch) and pass each (query, retrieved_passage) pair to the CrossEncoder
for scoring. The results are then sorted according to the CrossEncoder output.
This gives a significant boost compared to out-of-the-box Elasticsearch / BM25 ranking.
Running this script:
python train_cross-encoder.py
"""
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
import tqdm
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# First, we define the transformer model we want to fine-tune
model_name = "distilroberta-base"
train_batch_size = 32
num_epochs = 1
model_save_path = (
"output/training_ms-marco_cross-encoder-"
+ model_name.replace("/", "-")
+ "-"
+ datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# We train the network on a binary label task:
# Given a [query, passage] pair, is the label 0 = irrelevant or 1 = relevant?
# We use a positive-to-negative ratio: for 1 positive sample (label 1) we include 4 negative samples (label 0)
# in our training setup. For the negative samples, we use the triplets provided by MS Marco that
# specify (query, positive sample, negative sample).
pos_neg_ratio = 4
# Maximal number of training samples we want to use
max_train_samples = 2e7
# We set num_labels=1, which predicts a continuous score between 0 and 1
model = CrossEncoder(model_name, num_labels=1, max_length=512)
### Now we read the MS Marco dataset
data_folder = "msmarco-data"
os.makedirs(data_folder, exist_ok=True)
#### Read the corpus file, which contains all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, "collection.tsv")
if not os.path.exists(collection_filepath):
tar_filepath = os.path.join(data_folder, "collection.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download collection.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
with open(collection_filepath, "r", encoding="utf8") as fIn:
for line in fIn:
pid, passage = line.strip().split("\t")
corpus[pid] = passage
### Read the train queries, store in queries dict
queries = {}
queries_filepath = os.path.join(data_folder, "queries.train.tsv")
if not os.path.exists(queries_filepath):
tar_filepath = os.path.join(data_folder, "queries.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download queries.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
with open(queries_filepath, "r", encoding="utf8") as fIn:
for line in fIn:
qid, query = line.strip().split("\t")
queries[qid] = query
### Now we create our training & dev data
train_samples = []
dev_samples = {}
# We use 200 random queries from the train set for evaluation during training
# Each query has at least one relevant and up to 200 irrelevant (negative) passages
num_dev_queries = 200
num_max_dev_negatives = 200
# msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz and msmarco-qidpidtriples.rnd-shuf.train.tsv.gz are randomly
# shuffled versions of qidpidtriples.train.full.2.tsv.gz from the MS Marco website
# The train-eval split contains 500 random queries that can be used for evaluation during training
train_eval_filepath = os.path.join(data_folder, "msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz")
if not os.path.exists(train_eval_filepath):
logging.info("Download " + os.path.basename(train_eval_filepath))
util.http_get("https://sbert.net/datasets/msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz", train_eval_filepath)
with gzip.open(train_eval_filepath, "rt") as fIn:
for line in fIn:
qid, pos_id, neg_id = line.strip().split()
if qid not in dev_samples and len(dev_samples) < num_dev_queries:
dev_samples[qid] = {"query": queries[qid], "positive": set(), "negative": set()}
if qid in dev_samples:
dev_samples[qid]["positive"].add(corpus[pos_id])
if len(dev_samples[qid]["negative"]) < num_max_dev_negatives:
dev_samples[qid]["negative"].add(corpus[neg_id])
# Read our training file
train_filepath = os.path.join(data_folder, "msmarco-qidpidtriples.rnd-shuf.train.tsv.gz")
if not os.path.exists(train_filepath):
logging.info("Download " + os.path.basename(train_filepath))
util.http_get("https://sbert.net/datasets/msmarco-qidpidtriples.rnd-shuf.train.tsv.gz", train_filepath)
cnt = 0
with gzip.open(train_filepath, "rt") as fIn:
for line in tqdm.tqdm(fIn, unit_scale=True):
qid, pos_id, neg_id = line.strip().split()
if qid in dev_samples:
continue
query = queries[qid]
if (cnt % (pos_neg_ratio + 1)) == 0:
passage = corpus[pos_id]
label = 1
else:
passage = corpus[neg_id]
label = 0
train_samples.append(InputExample(texts=[query, passage], label=label))
cnt += 1
if cnt >= max_train_samples:
break
# We create a DataLoader to load our train samples
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
# We add an evaluator, which evaluates the performance during training
# It re-ranks the positive and negative passages for each dev query and computes Mean Reciprocal Rank (MRR)
evaluator = CERerankingEvaluator(dev_samples, name="train-eval")
# Configure the training
warmup_steps = 5000
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_dataloader=train_dataloader,
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=10000,
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=True,
)
# Save latest model
model.save(model_save_path + "-latest")
# Multilingual-Models
The issue with multilingual BERT (mBERT) as well as with XLM-RoBERTa is that they produce rather poor sentence representations out-of-the-box. Further, the vector spaces of different languages are not aligned, i.e., sentences with the same content in different languages would be mapped to different locations in the vector space.
In my publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813) I describe an easy approach to extend sentence embeddings to further languages.
Chien Vu also wrote a nice blog article on this technique: [A complete guide to transfer learning from English to other Languages using Sentence Embeddings BERT Models](https://towardsdatascience.com/a-complete-guide-to-transfer-learning-from-english-to-other-languages-using-sentence-embeddings-8c427f8804a9)
## Available Pre-trained Models
For a list of available models, see [Pretrained Models](https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models).
## Usage
You can use the models in the following way:
```python
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("model-name")
embeddings = embedder.encode(["Hello World", "Hallo Welt", "Hola mundo"])
print(embeddings)
```
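Because the vector spaces are aligned across languages, translations of the same sentence receive a high similarity score. A small sketch of this, assuming the `distiluse-base-multilingual-cased` model from the table below:

```python
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer("distiluse-base-multilingual-cased")
embeddings = embedder.encode(["Hello World", "Hallo Welt", "A cat sits on the mat"], convert_to_tensor=True)

# Translation pairs end up close in the shared vector space, unrelated sentences do not
print(util.cos_sim(embeddings[0], embeddings[1]).item())  # en-de translation pair -> high similarity
print(util.cos_sim(embeddings[0], embeddings[2]).item())  # unrelated sentences -> lower similarity
```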
## Performance
The performance was evaluated on the [Semantic Textual Similarity (STS) 2017 dataset](http://ixa2.si.ehu.es/stswiki/index.php/Main_Page). The task is to predict the semantic similarity (on a scale 0-5) of two given sentences. STS2017 has monolingual test data for English, Arabic, and Spanish, and cross-lingual test data for English-Arabic, -Spanish and -Turkish.
We extended the STS2017 and added cross-lingual test data for English-German, French-English, Italian-English, and Dutch-English ([STS2017-extended.zip](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/STS2017-extended.zip)). The performance is measured using Spearman correlation between the predicted similarity score and the gold score.
<table class="docutils">
<tr>
<th>Model</th>
<th>AR-AR</th>
<th>AR-EN</th>
<th>ES-ES</th>
<th>ES-EN</th>
<th>EN-EN</th>
<th>TR-EN</th>
<th>EN-DE</th>
<th>FR-EN</th>
<th>IT-EN</th>
<th>NL-EN</th>
<th>Average</th>
</tr>
<tr>
<td>XLM-RoBERTa mean pooling </td>
<td align="center">25.7</td>
<td align="center">17.4</td>
<td align="center">51.8</td>
<td align="center">10.9</td>
<td align="center">50.7</td>
<td align="center">9.2</td>
<td align="center">21.3</td>
<td align="center">16.6</td>
<td align="center">22.9</td>
<td align="center">26.0</td>
<td align="center">25.2</td>
</tr>
<tr>
<td>mBERT mean pooling </td>
<td align="center">50.9</td>
<td align="center">16.7</td>
<td align="center">56.7</td>
<td align="center">21.5</td>
<td align="center">54.4</td>
<td align="center">16.0</td>
<td align="center">33.9</td>
<td align="center">33.0</td>
<td align="center">34.0</td>
<td align="center">35.6</td>
<td align="center">35.3</td>
</tr>
<tr>
<td>LASER</td>
<td align="center">68.9</td>
<td align="center">66.5</td>
<td align="center">79.7</td>
<td align="center">57.9</td>
<td align="center">77.6</td>
<td align="center">72.0</td>
<td align="center">64.2</td>
<td align="center">69.1</td>
<td align="center">70.8</td>
<td align="center">68.5</td>
<td align="center">69.5</td>
</tr>
<tr>
<td colspan="12"><b>Sentence Transformer Models</b></td>
</tr>
<tr>
<td>distiluse-base-multilingual-cased</td>
<td align="center">75.9</td>
<td align="center">77.6</td>
<td align="center">85.3</td>
<td align="center">78.7</td>
<td align="center">85.4</td>
<td align="center">75.5</td>
<td align="center">80.3</td>
<td align="center">80.2</td>
<td align="center">80.5</td>
<td align="center">81.7</td>
<td align="center">80.1</td>
</tr>
</table>
## Extend your own models
![Multilingual Knowledge Distillation](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/multilingual-distillation.png)
The idea is based on a fixed (monolingual) **teacher model** that produces sentence embeddings with our desired properties in one language. The **student model** is supposed to mimic the teacher model, i.e., the same English sentence should be mapped to the same vector by the teacher and by the student model. So that the student model also works for further languages, we train the student model on parallel (translated) sentences: the translation of each sentence should be mapped to the same vector as the original sentence.
In the above figure, the student model should map *Hello World* and the German translation *Hallo Welt* to the vector of *teacher_model('Hello World')*. We achieve this by training the student model using mean squared error (MSE) loss.
In our experiments we initialized the student model with the multilingual XLM-RoBERTa model.
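As a minimal conceptual sketch of this objective (not the actual training loop, which the scripts below implement; the model names are the ones used later in make_multilingual.py), the two MSE terms for a single English/German pair look like this:

```python
import torch
from sentence_transformers import SentenceTransformer

teacher = SentenceTransformer("paraphrase-distilroberta-base-v2")  # fixed monolingual teacher
student = SentenceTransformer("xlm-roberta-base")  # multilingual student (mean pooling added automatically)

en, de = "Hello World", "Hallo Welt"
target = torch.tensor(teacher.encode(en))  # teacher embedding of the English source sentence
student_en = torch.tensor(student.encode(en))
student_de = torch.tensor(student.encode(de))

# Training minimizes both terms: student(source) -> teacher(source) and student(translation) -> teacher(source)
loss = torch.nn.functional.mse_loss(student_en, target) + torch.nn.functional.mse_loss(student_de, target)
print(loss.item())
```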
## Training
For a **fully automatic code example**, see [make_multilingual.py](make_multilingual.py).
This script downloads the parallel sentences corpus, a corpus with transcripts and translations from talks. It then extends a monolingual model to several languages (en, de, es, it, fr, ar, tr). The corpus contains parallel data for more than 100 languages, hence you can simply change the script and train a multilingual model for your favorite languages.
## Data Format
As training data we require parallel sentences, i.e., sentences translated into various languages. As data format, we use a tab-separated .tsv file. In the first column, you have your source sentence, for example, an English sentence. In the following columns, you have the translations of this source sentence. If you have multiple translations per source sentence, you can put them in the same line or in different lines.
```
Source_sentence Target_lang1 Target_lang2 Target_lang3
Source_sentence Target_lang1 Target_lang2
```
An example file could look like this (EN DE ES):
```
Hello World Hallo Welt Hola Mundo
Sentences are separated with a tab character. Die Sätze sind per Tab getrennt. Las oraciones se separan con un carácter de tabulación.
```
The order of the translations is not important; it is only important that the first column contains a sentence in a language that is understood by the teacher model.
## Loading Training Datasets
You can load such a training file using the *ParallelSentencesDataset* class:
```python
from torch.utils.data import DataLoader
from sentence_transformers import losses
from sentence_transformers.datasets import ParallelSentencesDataset
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model)
train_data.load_data("path/to/tab/separated/train-en-de.tsv")
train_data.load_data("path/to/tab/separated/train-en-es.tsv.gz")
train_data.load_data("path/to/tab/separated/train-en-fr.tsv.gz")
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)
```
You load a file with the *load_data()* method. You can load multiple files by calling load_data multiple times. You can also load regular files or .gz-compressed files.
By default, all datasets are weighted equally. In the above example, a (source, translation) pair is sampled with equal probability from each of the three datasets. If you pass a `weight` parameter (integer), you can weight some datasets higher or lower, for example as sketched below.
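Continuing the snippet above, a short sketch of such a weighting (the integers are relative sampling weights; the exact values are only an example):

```python
# Sample (source, translation) pairs from the en-de file twice as often as from the other files
train_data.load_data("path/to/tab/separated/train-en-de.tsv", weight=200)
train_data.load_data("path/to/tab/separated/train-en-es.tsv.gz", weight=100)
train_data.load_data("path/to/tab/separated/train-en-fr.tsv.gz", weight=100)
```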
## Sources for Training Data
A great website for a vast number of parallel (translated) datasets is [OPUS](http://opus.nlpl.eu/). There, you find parallel datasets for more than 400 languages.
The [examples/training/multilingual](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/multilingual/) folder contains some scripts that download parallel training data and bring it into the right format:
- [get_parallel_data_opus.py](get_parallel_data_opus.py): This script downloads data from the [OPUS](http://opus.nlpl.eu/) website.
- [get_parallel_data_tatoeba.py](get_parallel_data_tatoeba.py): This script downloads data from the [Tatoeba](https://tatoeba.org/) website, a website for language learners with example sentences in many languages.
- [get_parallel_data_talks.py](get_parallel_data_talks.py): This script downloads the parallel sentences corpus, which contains transcripts and translations of more than 4,000 talks in 100+ languages.
## Evaluation
Training can be evaluated in different ways. For an example of how to use these evaluation methods, see [make_multilingual.py](make_multilingual.py).
### MSE Evaluation
You can measure the mean squared error (MSE) between the student embeddings and the teacher embeddings. This can be achieved with the `MSEEvaluator`:
```python
# src_sentences and trg_sentences are lists of translated sentences, such that trg_sentences[i] is the translation of src_sentences[i]
dev_mse = evaluation.MSEEvaluator(src_sentences, trg_sentences, teacher_model=teacher_model)
```
This evaluator computes the teacher embeddings for the `src_sentences`, for example, for English. During training, the student model is used to compute embeddings for the `trg_sentences`, for example, for Spanish. The distance between teacher and student embeddings is measured. Lower scores indicate better performance.
### Translation Accuracy
You can also measure the translation accuracy. Given a list of source sentences (for example, 1000 English sentences) and a list of matching target (translated) sentences (for example, 1000 Spanish sentences), we check for each sentence pair whether their embeddings are closest in terms of cosine similarity: i.e., for each `src_sentences[i]` we check whether `trg_sentences[i]` has the highest similarity out of all target sentences. If this is the case, we have a hit, otherwise an error. This evaluator reports accuracy (higher = better).
```python
# src_sentences and trg_sentences are lists of translated sentences, such that trg_sentences[i] is the translation of src_sentences[i]
dev_trans_acc = evaluation.TranslationEvaluator(
src_sentences,
trg_sentences,
name=os.path.basename(dev_file),
batch_size=inference_batch_size,
)
```
### Multi-Lingual Semantic Textual Similarity
You can also measure the semantic textual similarity (STS) between sentence pairs in different languages:
```python
sts_evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
```
Where `sentences1` and `sentences2` are lists of sentences and `scores` is a list of numeric values indicating the semantic similarity between `sentences1[i]` and `sentences2[i]`.
## Citation
If you use the code for multilingual models, feel free to cite our publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813):
```
@article{reimers-2020-multilingual-sentence-bert,
title = "Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation",
author = "Reimers, Nils and Gurevych, Iryna",
journal= "arXiv preprint arXiv:2004.09813",
month = "04",
year = "2020",
url = "http://arxiv.org/abs/2004.09813",
}
```
"""
OPUS (http://opus.nlpl.eu/) is a great collection of different parallel datasets for more than 400 languages.
On the website, you can download parallel datasets for many languages in different formats. I found that
the format "Bottom-left triangle: download plain text files (MOSES/GIZA++)" requires minimal
overhead for post-processing to get it into a suitable format for this library.
You can use the OPUS dataset to create multilingual sentence embeddings. This script contains code to download
OPUS datasets for the desired languages and to create training files in the right format.
1) First, you need to install OpusTools (https://github.com/Helsinki-NLP/OpusTools/tree/master/opustools_pkg):
pip install opustools
2) Once you have OpusTools installed, you can download data in the right format via:
mkdir parallel-sentences
opus_read -d [CORPUS] -s [SRC_LANG] -t [TRG_LANG] --write parallel-sentences/[FILENAME].tsv.gz -wm moses -dl opus -p raw
For example:
mkdir parallel-sentences
opus_read -d JW300 -s en -t de --write parallel-sentences/JW300-en-de.tsv.gz -wm moses -dl opus -p raw
This downloads the JW300 Corpus (http://opus.nlpl.eu/JW300.php) for English (en) and German (de) and writes the output to
parallel-sentences/JW300-en-de.tsv.gz
####################
This Python code automates the download and creation of the parallel sentences files.
"""
from opustools import OpusRead
import os
corpora = ["JW300"] # Corpora you want to use
source_languages = ["en"] # Source language, our teacher model is able to understand
target_languages = ["de", "es", "it", "fr", "ar", "tr"] # Target languages, out student model should learn
output_folder = "parallel-sentences"
opus_download_folder = "./opus"
# Iterate over all corpora / source language / target language combinations and download the files
os.makedirs(output_folder, exist_ok=True)
for corpus in corpora:
for src_lang in source_languages:
for trg_lang in target_languages:
output_filename = os.path.join(output_folder, "{}-{}-{}.tsv.gz".format(corpus, src_lang, trg_lang))
if not os.path.exists(output_filename):
print("Create:", output_filename)
try:
read = OpusRead(
directory=corpus,
source=src_lang,
target=trg_lang,
write=[output_filename],
download_dir=opus_download_folder,
preprocess="raw",
write_mode="moses",
suppress_prompts=True,
)
read.printPairs()
except Exception:
print("An error occurred during the creation of", output_filename)
"""
This script downloads the parallel sentences corpus and creates parallel sentences tsv files that can be used to extend
existing sentence embedding models to new languages.
The parallel sentences corpus is a crawl of transcripts from talks, which are translated into 100+ languages.
The parallel sentences corpus cannot be downloaded automatically. It is available for research purposes only (CC-BY-NC).
The training procedure can be found in the files make_multilingual.py and make_multilingual_sys.py.
Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""
import os
import sentence_transformers.util
import gzip
import csv
from tqdm.autonotebook import tqdm
source_languages = set(["en"]) # Languages our (monolingual) teacher model understands
target_languages = set(["de", "es", "it", "fr", "ar", "tr"]) # New languages we want to extend to
dev_sentences = 1000 # Number of sentences we want to use for development
download_url = "https://sbert.net/datasets/parallel-sentences.tsv.gz" # Specify parallel sentences URL here
parallel_sentences_path = "../datasets/parallel-sentences.tsv.gz" # Path of the parallel-sentences.tsv.gz file.
parallel_sentences_folder = "parallel-sentences/"
os.makedirs(os.path.dirname(parallel_sentences_path), exist_ok=True)
if not os.path.exists(parallel_sentences_path):
print("parallel-sentences.tsv.gz does not exists. Try to download from server")
sentence_transformers.util.http_get(download_url, parallel_sentences_path)
os.makedirs(parallel_sentences_folder, exist_ok=True)
train_files = []
dev_files = []
files_to_create = []
for source_lang in source_languages:
for target_lang in target_languages:
output_filename_train = os.path.join(
parallel_sentences_folder, "talks-{}-{}-train.tsv.gz".format(source_lang, target_lang)
)
output_filename_dev = os.path.join(
parallel_sentences_folder, "talks-{}-{}-dev.tsv.gz".format(source_lang, target_lang)
)
train_files.append(output_filename_train)
dev_files.append(output_filename_dev)
if not os.path.exists(output_filename_train) or not os.path.exists(output_filename_dev):
files_to_create.append(
{
"src_lang": source_lang,
"trg_lang": target_lang,
"fTrain": gzip.open(output_filename_train, "wt", encoding="utf8"),
"fDev": gzip.open(output_filename_dev, "wt", encoding="utf8"),
"devCount": 0,
}
)
if len(files_to_create) > 0:
print(
"Parallel sentences files {} do not exist. Create these files now".format(
", ".join(map(lambda x: x["src_lang"] + "-" + x["trg_lang"], files_to_create))
)
)
with gzip.open(parallel_sentences_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for line in tqdm(reader, desc="Sentences"):
for outfile in files_to_create:
src_text = line[outfile["src_lang"]].strip()
trg_text = line[outfile["trg_lang"]].strip()
if src_text != "" and trg_text != "":
if outfile["devCount"] < dev_sentences:
outfile["devCount"] += 1
fOut = outfile["fDev"]
else:
fOut = outfile["fTrain"]
fOut.write("{}\t{}\n".format(src_text, trg_text))
for outfile in files_to_create:
outfile["fTrain"].close()
outfile["fDev"].close()
print("---DONE---")
"""
Tatoeba (https://tatoeba.org/) is a collection of sentences and translations, mainly aimed at language learning.
It is available for more than 300 languages.
This script downloads the Tatoeba corpus and extracts the sentences & translations in the languages you like
"""
import os
import sentence_transformers
import tarfile
import gzip
# Note: Tatoeba uses 3-letter language codes (ISO-639-2),
# while other datasets like OPUS use 2 letter language codes (ISO-639-1)
# For training of sentence transformers, which type of language code is used doesn't matter.
# For language codes, see: https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes
source_languages = set(["eng"])
target_languages = set(["deu", "ara", "tur", "spa", "ita", "fra"])
num_dev_sentences = 1000 # Number of sentences that are used to create a development set
tatoeba_folder = "../datasets/tatoeba"
output_folder = "parallel-sentences/"
sentences_file_bz2 = os.path.join(tatoeba_folder, "sentences.tar.bz2")
sentences_file = os.path.join(tatoeba_folder, "sentences.csv")
links_file_bz2 = os.path.join(tatoeba_folder, "links.tar.bz2")
links_file = os.path.join(tatoeba_folder, "links.csv")
download_url = "https://downloads.tatoeba.org/exports/"
os.makedirs(tatoeba_folder, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)
# Download files if needed
for filepath in [sentences_file_bz2, links_file_bz2]:
if not os.path.exists(filepath):
url = download_url + os.path.basename(filepath)
print("Download", url)
sentence_transformers.util.http_get(url, filepath)
# Extract files if needed
if not os.path.exists(sentences_file):
print("Extract", sentences_file_bz2)
tar = tarfile.open(sentences_file_bz2, "r:bz2")
tar.extract("sentences.csv", path=tatoeba_folder)
tar.close()
if not os.path.exists(links_file):
print("Extract", links_file_bz2)
tar = tarfile.open(links_file_bz2, "r:bz2")
tar.extract("links.csv", path=tatoeba_folder)
tar.close()
# Read sentences
sentences = {}
all_langs = target_languages.union(source_languages)
print("Read sentences.csv file")
with open(sentences_file, encoding="utf8") as fIn:
for line in fIn:
id, lang, sentence = line.strip().split("\t")
if lang in all_langs:
sentences[id] = (lang, sentence)
# Read links that map the translations between different languages
print("Read links.csv")
translations = {src_lang: {trg_lang: {} for trg_lang in target_languages} for src_lang in source_languages}
with open(links_file, encoding="utf8") as fIn:
for line in fIn:
src_id, target_id = line.strip().split()
if src_id in sentences and target_id in sentences:
src_lang, src_sent = sentences[src_id]
trg_lang, trg_sent = sentences[target_id]
if src_lang in source_languages and trg_lang in target_languages:
if src_sent not in translations[src_lang][trg_lang]:
translations[src_lang][trg_lang][src_sent] = []
translations[src_lang][trg_lang][src_sent].append(trg_sent)
# Write everything to the output folder
print("Write output files")
for src_lang in source_languages:
for trg_lang in target_languages:
source_sentences = list(translations[src_lang][trg_lang])
train_sentences = source_sentences[num_dev_sentences:]
dev_sentences = source_sentences[0:num_dev_sentences]
print("{}-{} has {} sentences".format(src_lang, trg_lang, len(source_sentences)))
if len(dev_sentences) > 0:
with gzip.open(
os.path.join(output_folder, "Tatoeba-{}-{}-dev.tsv.gz".format(src_lang, trg_lang)),
"wt",
encoding="utf8",
) as fOut:
for sent in dev_sentences:
fOut.write("\t".join([sent] + translations[src_lang][trg_lang][sent]))
fOut.write("\n")
if len(train_sentences) > 0:
with gzip.open(
os.path.join(output_folder, "Tatoeba-{}-{}-train.tsv.gz".format(src_lang, trg_lang)),
"wt",
encoding="utf8",
) as fOut:
for sent in train_sentences:
fOut.write("\t".join([sent] + translations[src_lang][trg_lang][sent]))
fOut.write("\n")
print("---DONE---")
"""
This script downloads the WikiMatrix corpus (https://github.com/facebookresearch/LASER/tree/master/tasks/WikiMatrix)
and creates parallel sentences tsv files that can be used to extend existing sentence embedding models to new languages.
WikiMatrix contains parallel sentences mined from Wikipedia in various languages.
Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""
import os
import sentence_transformers.util
import gzip
source_languages = set(["en"]) # Languages our (monolingual) teacher model understands
target_languages = set(["de", "es", "it", "fr", "ar", "tr"]) # New languages we want to extend to
num_dev_sentences = 1000 # Number of sentences we want to use for development
threshold = 1.075 # Only use sentences with a LASER similarity score above the threshold
download_url = "https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/"
download_folder = "../datasets/WikiMatrix/"
parallel_sentences_folder = "parallel-sentences/"
os.makedirs(os.path.dirname(download_folder), exist_ok=True)
os.makedirs(parallel_sentences_folder, exist_ok=True)
for source_lang in source_languages:
for target_lang in target_languages:
filename_train = os.path.join(
parallel_sentences_folder, "WikiMatrix-{}-{}-train.tsv.gz".format(source_lang, target_lang)
)
filename_dev = os.path.join(
parallel_sentences_folder, "WikiMatrix-{}-{}-dev.tsv.gz".format(source_lang, target_lang)
)
if not os.path.exists(filename_train) and not os.path.exists(filename_dev):
langs_ordered = sorted([source_lang, target_lang])
wikimatrix_filename = "WikiMatrix.{}-{}.tsv.gz".format(*langs_ordered)
wikimatrix_filepath = os.path.join(download_folder, wikimatrix_filename)
if not os.path.exists(wikimatrix_filepath):
print("Download", download_url + wikimatrix_filename)
try:
sentence_transformers.util.http_get(download_url + wikimatrix_filename, wikimatrix_filepath)
except Exception:
print("Was not able to download", download_url + wikimatrix_filename)
continue
if not os.path.exists(wikimatrix_filepath):
continue
train_sentences = []
dev_sentences = []
dev_sentences_set = set()
extract_dev_sentences = True
with gzip.open(wikimatrix_filepath, "rt", encoding="utf8") as fIn:
for line in fIn:
score, sent1, sent2 = line.strip().split("\t")
sent1 = sent1.strip()
sent2 = sent2.strip()
score = float(score)
if score < threshold:
break
if sent1 == sent2:
continue
if langs_ordered.index(source_lang) == 1: # Swap, so that src lang is sent1
sent1, sent2 = sent2, sent1
# Avoid duplicates in development set
if sent1 in dev_sentences_set or sent2 in dev_sentences_set:
continue
if extract_dev_sentences:
dev_sentences.append([sent1, sent2])
dev_sentences_set.add(sent1)
dev_sentences_set.add(sent2)
if len(dev_sentences) >= num_dev_sentences:
extract_dev_sentences = False
else:
train_sentences.append([sent1, sent2])
print("Write", len(dev_sentences), "dev sentences", filename_dev)
with gzip.open(filename_dev, "wt", encoding="utf8") as fOut:
for sents in dev_sentences:
fOut.write("\t".join(sents))
fOut.write("\n")
print("Write", len(train_sentences), "train sentences", filename_train)
with gzip.open(filename_train, "wt", encoding="utf8") as fOut:
for sents in train_sentences:
fOut.write("\t".join(sents))
fOut.write("\n")
print("---DONE---")
"""
This script contains an example of how to extend an existing sentence embedding model to new languages.
The (monolingual) teacher model you would like to extend to new languages is specified in the teacher_model_name
variable. We train a multilingual student model (variable student_model_name) to imitate the teacher model
on multiple languages.
For training, you need parallel sentence data (machine translation training data). You need tab-separated files (.tsv)
where the first column contains a sentence in a language understood by the teacher model, e.g. English,
and the further columns contain the corresponding translations into the languages you want to extend to.
This script automatically downloads the parallel sentences corpus. This corpus contains transcripts from
talks translated into 100+ languages. For other parallel data, see the get_parallel_data_[].py scripts
Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""
from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
from datetime import datetime
import os
import logging
import sentence_transformers.util
import csv
import gzip
from tqdm.autonotebook import tqdm
import numpy as np
import zipfile
import io
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)
teacher_model_name = (
"paraphrase-distilroberta-base-v2" # Our monolingual teacher model, we want to convert to multiple languages
)
student_model_name = "xlm-roberta-base" # Multilingual base model we use to imitate the teacher model
max_seq_length = 128 # Student model max. lengths for inputs (number of word pieces)
train_batch_size = 64 # Batch size for training
inference_batch_size = 64 # Batch size at inference
max_sentences_per_language = 500000 # Maximum number of parallel sentences for training
train_max_sentence_length = 250 # Maximum length (characters) for parallel training sentences
num_epochs = 5 # Train for x epochs
num_warmup_steps = 10000  # Warmup steps
num_evaluation_steps = 1000 # Evaluate performance after every xxxx steps
dev_sentences = 1000 # Number of parallel sentences to be used for development
# Define the language codes you would like to extend the model to
source_languages = set(["en"]) # Our teacher model accepts English (en) sentences
target_languages = set(
["de", "es", "it", "fr", "ar", "tr"]
) # We want to extend the model to these new languages. For language codes, see the header of the train file
output_path = (
"output/make-multilingual-"
+ "-".join(sorted(list(source_languages)) + sorted(list(target_languages)))
+ "-"
+ datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# This function downloads a corpus if it does not exist
def download_corpora(filepaths):
if not isinstance(filepaths, list):
filepaths = [filepaths]
for filepath in filepaths:
if not os.path.exists(filepath):
print(filepath, "does not exists. Try to download from server")
filename = os.path.basename(filepath)
url = "https://sbert.net/datasets/" + filename
sentence_transformers.util.http_get(url, filepath)
# Here we define train and dev corpora
train_corpus = "datasets/parallel-sentences.tsv.gz"
sts_corpus = "datasets/stsbenchmark.zip"
parallel_sentences_folder = "parallel-sentences/"
# Check if the files exist. If not, they are downloaded
download_corpora([train_corpus, sts_corpus])
# Create parallel files for the selected language combinations
os.makedirs(parallel_sentences_folder, exist_ok=True)
train_files = []
dev_files = []
files_to_create = []
for source_lang in source_languages:
for target_lang in target_languages:
output_filename_train = os.path.join(
parallel_sentences_folder, "talks-{}-{}-train.tsv.gz".format(source_lang, target_lang)
)
output_filename_dev = os.path.join(
parallel_sentences_folder, "talks-{}-{}-dev.tsv.gz".format(source_lang, target_lang)
)
train_files.append(output_filename_train)
dev_files.append(output_filename_dev)
if not os.path.exists(output_filename_train) or not os.path.exists(output_filename_dev):
files_to_create.append(
{
"src_lang": source_lang,
"trg_lang": target_lang,
"fTrain": gzip.open(output_filename_train, "wt", encoding="utf8"),
"fDev": gzip.open(output_filename_dev, "wt", encoding="utf8"),
"devCount": 0,
}
)
if len(files_to_create) > 0:
print(
"Parallel sentences files {} do not exist. Create these files now".format(
", ".join(map(lambda x: x["src_lang"] + "-" + x["trg_lang"], files_to_create))
)
)
with gzip.open(train_corpus, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for line in tqdm(reader, desc="Sentences"):
for outfile in files_to_create:
src_text = line[outfile["src_lang"]].strip()
trg_text = line[outfile["trg_lang"]].strip()
if src_text != "" and trg_text != "":
if outfile["devCount"] < dev_sentences:
outfile["devCount"] += 1
fOut = outfile["fDev"]
else:
fOut = outfile["fTrain"]
fOut.write("{}\t{}\n".format(src_text, trg_text))
for outfile in files_to_create:
outfile["fTrain"].close()
outfile["fDev"].close()
######## Start the extension of the teacher model to multiple languages ########
logger.info("Load teacher model")
teacher_model = SentenceTransformer(teacher_model_name)
logger.info("Create student model from scratch")
word_embedding_model = models.Transformer(student_model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(
student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True
)
for train_file in train_files:
train_data.load_data(
train_file, max_sentences=max_sentences_per_language, max_sentence_length=train_max_sentence_length
)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)
#### Evaluate cross-lingual performance on different tasks #####
evaluators = []  # evaluators is a list of different evaluators that we call periodically
for dev_file in dev_files:
logger.info("Create evaluator for " + dev_file)
src_sentences = []
trg_sentences = []
with gzip.open(dev_file, "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
if splits[0] != "" and splits[1] != "":
src_sentences.append(splits[0])
trg_sentences.append(splits[1])
# Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
dev_mse = evaluation.MSEEvaluator(
src_sentences,
trg_sentences,
name=os.path.basename(dev_file),
teacher_model=teacher_model,
batch_size=inference_batch_size,
)
evaluators.append(dev_mse)
# TranslationEvaluator computes the embeddings for all parallel sentences. It then checks whether the embedding of source[i] is the closest to target[i] out of all available target sentences
dev_trans_acc = evaluation.TranslationEvaluator(
src_sentences, trg_sentences, name=os.path.basename(dev_file), batch_size=inference_batch_size
)
evaluators.append(dev_trans_acc)
##### Read cross-lingual Semantic Textual Similarity (STS) data ####
all_languages = list(set(list(source_languages) + list(target_languages)))
sts_data = {}
# Open the ZIP File of STS2017-extended.zip and check for which language combinations we have STS data
with zipfile.ZipFile(sts_corpus) as zip:
filelist = zip.namelist()
sts_files = []
for i in range(len(all_languages)):
for j in range(i, len(all_languages)):
lang1 = all_languages[i]
lang2 = all_languages[j]
filepath = "STS2017-extended/STS.{}-{}.txt".format(lang1, lang2)
if filepath not in filelist:
lang1, lang2 = lang2, lang1
filepath = "STS2017-extended/STS.{}-{}.txt".format(lang1, lang2)
if filepath in filelist:
filename = os.path.basename(filepath)
sts_data[filename] = {"sentences1": [], "sentences2": [], "scores": []}
fIn = zip.open(filepath)
for line in io.TextIOWrapper(fIn, "utf8"):
sent1, sent2, score = line.strip().split("\t")
score = float(score)
sts_data[filename]["sentences1"].append(sent1)
sts_data[filename]["sentences2"].append(sent2)
sts_data[filename]["scores"].append(score)
for filename, data in sts_data.items():
test_evaluator = evaluation.EmbeddingSimilarityEvaluator(
data["sentences1"],
data["sentences2"],
data["scores"],
batch_size=inference_batch_size,
name=filename,
show_progress_bar=False,
)
evaluators.append(test_evaluator)
# Train the model
student_model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)),
epochs=num_epochs,
warmup_steps=num_warmup_steps,
evaluation_steps=num_evaluation_steps,
output_path=output_path,
save_best_model=True,
optimizer_params={"lr": 2e-5, "eps": 1e-6},
)
"""
This script contains an example of how to extend an existing sentence embedding model to new languages.
The (monolingual) teacher model you would like to extend to new languages is specified in the teacher_model_name
variable. We train a multilingual student model (variable student_model_name) to imitate the teacher model
on multiple languages.
For training, you need parallel sentence data (machine translation training data). You need tab-separated files (.tsv)
where the first column contains a sentence in a language understood by the teacher model, e.g. English,
and the further columns contain the corresponding translations into the languages you want to extend to.
See get_parallel_data_[opus/tatoeba/talks].py for automatic download of parallel sentences datasets.
Note: See make_multilingual.py for a fully automated script that downloads the necessary data and trains the model. This script just trains the model if you have already parallel data in the right format.
Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
Usage:
python make_multilingual_sys.py train1.tsv.gz train2.tsv.gz train3.tsv.gz --dev dev1.tsv.gz dev2.tsv.gz
For example:
python make_multilingual_sys.py parallel-sentences/talks-en-de-train.tsv.gz --dev parallel-sentences/talks-en-de-dev.tsv.gz
To load all training & dev files from a folder (Linux):
python make_multilingual_sys.py parallel-sentences/*-train.tsv.gz --dev parallel-sentences/*-dev.tsv.gz
"""
from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
from datetime import datetime
import os
import logging
import gzip
import numpy as np
import sys
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)
teacher_model_name = (
"paraphrase-distilroberta-base-v2" # Our monolingual teacher model, we want to convert to multiple languages
)
student_model_name = "xlm-roberta-base" # Multilingual base model we use to imitate the teacher model
max_seq_length = 128 # Student model max. lengths for inputs (number of word pieces)
train_batch_size = 64 # Batch size for training
inference_batch_size = 64 # Batch size at inference
max_sentences_per_trainfile = 500000 # Maximum number of parallel sentences for training
train_max_sentence_length = 250 # Maximum length (characters) for parallel training sentences
num_epochs = 5 # Train for x epochs
num_warmup_steps = 10000  # Warmup steps
num_evaluation_steps = 1000 # Evaluate performance after every xxxx steps
output_path = "output/make-multilingual-sys-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Read passed arguments
train_files = []
dev_files = []
is_dev_file = False
for arg in sys.argv[1:]:
if arg.lower() == "--dev":
is_dev_file = True
else:
if not os.path.exists(arg):
print("File could not be found:", arg)
exit()
if is_dev_file:
dev_files.append(arg)
else:
train_files.append(arg)
if len(train_files) == 0:
print("Please pass at least some train files")
print("python make_multilingual_sys.py file1.tsv.gz file2.tsv.gz --dev dev1.tsv.gz dev2.tsv.gz")
exit()
logger.info("Train files: {}".format(", ".join(train_files)))
logger.info("Dev files: {}".format(", ".join(dev_files)))
######## Start the extension of the teacher model to multiple languages ########
logger.info("Load teacher model")
teacher_model = SentenceTransformer(teacher_model_name)
logger.info("Create student model from scratch")
word_embedding_model = models.Transformer(student_model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(
student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True
)
for train_file in train_files:
train_data.load_data(
train_file, max_sentences=max_sentences_per_trainfile, max_sentence_length=train_max_sentence_length
)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)
#### Evaluate cross-lingual performance on different tasks #####
evaluators = []  # evaluators is a list of different evaluators that we call periodically
for dev_file in dev_files:
logger.info("Create evaluator for " + dev_file)
src_sentences = []
trg_sentences = []
with gzip.open(dev_file, "rt", encoding="utf8") if dev_file.endswith(".gz") else open(
dev_file, encoding="utf8"
) as fIn:
for line in fIn:
splits = line.strip().split("\t")
if splits[0] != "" and splits[1] != "":
src_sentences.append(splits[0])
trg_sentences.append(splits[1])
# Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
dev_mse = evaluation.MSEEvaluator(
src_sentences,
trg_sentences,
name=os.path.basename(dev_file),
teacher_model=teacher_model,
batch_size=inference_batch_size,
)
evaluators.append(dev_mse)
# TranslationEvaluator computes the embeddings for all parallel sentences. It then checks whether the embedding of source[i] is the closest to target[i] out of all available target sentences
dev_trans_acc = evaluation.TranslationEvaluator(
src_sentences, trg_sentences, name=os.path.basename(dev_file), batch_size=inference_batch_size
)
evaluators.append(dev_trans_acc)
# Train the model
student_model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)),
epochs=num_epochs,
warmup_steps=num_warmup_steps,
evaluation_steps=num_evaluation_steps,
output_path=output_path,
save_best_model=True,
optimizer_params={"lr": 2e-5, "eps": 1e-6, "correct_bias": False},
)
# Natural Language Inference
Given two sentences (premise and hypothesis), Natural Language Inference (NLI) is the task of deciding whether the premise entails the hypothesis, whether they contradict each other, or whether they are neutral. Commonly used NLI datasets are [SNLI](https://arxiv.org/abs/1508.05326) and [MultiNLI](https://arxiv.org/abs/1704.05426).
[Conneau et al.](https://arxiv.org/abs/1705.02364) showed that NLI data can be quite useful when training Sentence Embedding methods. We also found this in our [Sentence-BERT-Paper](https://arxiv.org/abs/1908.10084) and often use NLI as a first fine-tuning step for sentence embedding methods.
To train on NLI, see the following example files:
- **[training_nli.py](training_nli.py)** - This example uses the Softmax-Classification-Loss, as described in the [SBERT-Paper](https://arxiv.org/abs/1908.10084), to learn sentence embeddings.
- **[training_nli_v2.py](training_nli_v2.py)** - The Softmax-Classification-Loss, as used in our original SBERT paper, does not yield optimal performance. A better loss is [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss), where we provide pairs or triplets. In that example, we provide a triplet of the format: (anchor, entailment_sentence, contradiction_sentence). The NLI data provides such triplets. The MultipleNegativesRankingLoss yields much higher performance and is more intuitive than the Softmax-Classification-Loss. We have used this loss to train the paraphrase model in our [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813) paper.
- **[training_nli_v3.py](training_nli_v3.py)** - Following the [GISTEmbed](https://arxiv.org/abs/2402.16829) paper, we can modify the in-batch negative selection from [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss) using a guiding model. Candidate negative pairs are ignored during training if the guiding model considers the pair to be too similar. In practice, the [GISTEmbedLoss](https://www.sbert.net/docs/package_reference/losses.html#gistembedloss) tends to produce a stronger training signal than `MultipleNegativesRankingLoss` at the cost of some training overhead for running inference on the guiding model; a minimal construction sketch follows this list.
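A minimal sketch of constructing this loss (the guide model name is only an example):

```python
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("distilroberta-base")  # model being trained
guide = SentenceTransformer("all-MiniLM-L6-v2")  # example guiding model
train_loss = losses.GISTEmbedLoss(model, guide)
# train_loss is then used like MultipleNegativesRankingLoss, with (anchor, positive[, negative]) InputExamples
```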
## Data
In our experiments we combine [SNLI](https://arxiv.org/abs/1508.05326) and [MultiNLI](https://arxiv.org/abs/1704.05426), which we call AllNLI. These two datasets contain sentence pairs and one of three labels: entailment, neutral, contradiction:
| Sentence A (Premise) | Sentence B (Hypothesis) | Label |
| --- | --- | --- |
| A soccer game with multiple males playing. | Some men are playing a sport. | entailment |
| An older and younger man smiling. | Two men are smiling and laughing at the cats playing on the floor. | neutral |
| A man inspects the uniform of a figure in some East Asian country. | The man is sleeping. | contradiction |
## SoftmaxLoss
[Conneau et al.](https://arxiv.org/abs/1705.02364) described how a softmax classifier on top of a siamese network can be used to learn meaningful sentence representations. We can achieve this by using the [losses.SoftmaxLoss](../../../docs/package_reference/losses.html#softmaxloss) class.
The softmax loss looks like this:
![SBERT SoftmaxLoss](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/SBERT_SoftmaxLoss.png "SBERT SoftmaxLoss")
We pass the two sentences through our SentenceTransformer network and get the sentence embeddings *u* and *v*. We then concatenate u, v and |u-v| to form one long vector. This vector is then passed to a softmax classifier, which predicts our three classes (entailment, neutral, contradiction).
This setup learns sentence embeddings that can later be used for a wide variety of tasks.
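A minimal sketch of this setup, using two labeled pairs from the table above (model name and hyperparameters are only examples; see [training_nli.py](training_nli.py) for the full script):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("distilroberta-base")  # transformer + mean pooling
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}

train_examples = [
    InputExample(texts=["A soccer game with multiple males playing.", "Some men are playing a sport."],
                 label=label2int["entailment"]),
    InputExample(texts=["A man inspects the uniform of a figure in some East Asian country.", "The man is sleeping."],
                 label=label2int["contradiction"]),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)

# The classifier head on top of (u, v, |u-v|) is created inside SoftmaxLoss
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=len(label2int),
)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)
```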
## MultipleNegativesRankingLoss
That the softmax-loss with NLI data produces (relatively) good sentence embeddings is rather coincidental. The [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss) is much more intuitive and also produces significantly better sentence representations.
The training data for MultipleNegativesRankingLoss consists of sentence pairs [(a<sub>1</sub>, b<sub>1</sub>), ..., (a<sub>n</sub>, b<sub>n</sub>)] where we assume that (a<sub>i</sub>, b<sub>i</sub>) are similar sentences and (a<sub>i</sub>, b<sub>j</sub>) are dissimilar sentences for i != j. The loss minimizes the distance between (a<sub>i</sub>, b<sub>i</sub>) while simultaneously maximizing the distance between (a<sub>i</sub>, b<sub>j</sub>) for all i != j.
For example in the following picture:
![](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/MultipleNegativeRankingLoss.png)
The distance between (a<sub>1</sub>, b<sub>1</sub>) is reduced, while the distance between (a<sub>1</sub>, b<sub>2...5</sub>) will be increased. The same is done for a<sub>2</sub>, ..., a<sub>5</sub>.
Using MultipleNegativesRankingLoss with NLI is rather easy: we define sentences that have an *entailment* label as positive pairs. E.g., we have pairs like (*"A soccer game with multiple males playing."*, *"Some men are playing a sport."*) and want these pairs to be close in the vector space.
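A minimal sketch with entailment pairs (the model name is only an example); within a batch, every other sentence serves as an in-batch negative:

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("distilroberta-base")
train_examples = [
    InputExample(texts=["A soccer game with multiple males playing.", "Some men are playing a sport."]),
    # ... more (premise, entailed hypothesis) pairs
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
```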
### MultipleNegativesRankingLoss with Hard Negatives
We can further improve MultipleNegativesRankingLoss by not only providing pairs, but by providing triplets: [(a<sub>1</sub>, b<sub>1</sub>, c<sub>1</sub>), ..., (a<sub>n</sub>, b<sub>n</sub>, c<sub>n</sub>)]
The entries c<sub>i</sub> are so-called hard negatives: on a lexical level, they are similar to a<sub>i</sub> and b<sub>i</sub>, but on a semantic level they mean different things and should not be close to a<sub>i</sub> in the vector space.
For NLI data, we can use the contradiction-label to create such triplets with a hard negative. So our triplets look like this:
("*A soccer game with multiple males playing."*, *"Some men are playing a sport."*, *"A group of men playing a baseball game."*).
We want the sentences *"A soccer game with multiple males playing."* and *"Some men are playing a sport."* to be close in the vector space, while there should be a larger distance between *"A soccer game with multiple males playing."* and "*A group of men playing a baseball game."*.
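Compared to the pair sketch above, the only change is that each `InputExample` carries a third text, which MultipleNegativesRankingLoss treats as an additional hard negative:

```python
train_examples = [
    InputExample(texts=[
        "A soccer game with multiple males playing.",  # anchor (premise)
        "Some men are playing a sport.",  # positive (entailment)
        "A group of men playing a baseball game.",  # hard negative (contradiction)
    ]),
    # ... more (anchor, entailment, contradiction) triplets
]
train_loss = losses.MultipleNegativesRankingLoss(model)
```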