The issue with multilingual BERT (mBERT) and with XLM-RoBERTa is that they produce rather poor sentence representations out-of-the-box. Further, the vector spaces of different languages are not aligned, i.e., sentences with the same content in different languages would be mapped to different locations in the vector space.
In my publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813) I describe an easy approach to extend sentence embeddings to further languages.
Chien Vu also wrote a nice blog article on this technique: [A complete guide to transfer learning from English to other Languages using Sentence Embeddings BERT Models](https://towardsdatascience.com/a-complete-guide-to-transfer-learning-from-english-to-other-languages-using-sentence-embeddings-8c427f8804a9)
## Available Pre-trained Models
For a list of available models, see [Pretrained Models](https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models).
The idea is based on a fixed (monolingual) **teacher model** that produces sentence embeddings with our desired properties in one language (e.g. English). The **student model** is supposed to mimic the teacher model, i.e., the same English sentence should be mapped to the same vector by the teacher and by the student model. Additionally, in order to make the student model work for other languages, we train the student model on parallel (translated) sentences. The translation of each sentence should also be mapped to the same vector as the original sentence.
The student model should map *Hello World* and the German translation *Hallo Welt* to the vector of ``teacher_model('Hello World')``. We achieve this by training the student model using mean squared error (MSE) loss.
In our experiments we initialized the student model with the multilingual [XLM-RoBERTa model](https://huggingface.co/FacebookAI/xlm-roberta-base).
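A minimal sketch of this setup; the teacher model name is an example, and any strong monolingual embedding model can take its place:

```python
from sentence_transformers import SentenceTransformer, losses

# Example teacher: a strong (monolingual) English sentence embedding model
teacher_model = SentenceTransformer("paraphrase-distilroberta-base-v2")
# Student: a multilingual model that will learn to mimic the teacher
student_model = SentenceTransformer("xlm-roberta-base")
# The student is trained with MSE loss to reproduce the teacher embeddings
train_loss = losses.MSELoss(model=student_model)
```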
## Training
For a **fully automatic code example**, see [make_multilingual.py](make_multilingual.py).
This script downloads the parallel sentences corpus, a corpus with transcripts and translations from talks. It then extends a monolingual model to several languages (en, de, es, it, fr, ar, tr). This corpus contains parallel data for more than 100 languages, hence, you can simply change the script and train a multilingual model in your favorite languages.
## Datasets
```eval_rst
As training data we require parallel sentences, i.e., sentences translated in various languages. In particular, we will use :class:`~datasets.Dataset` instances with ``"english"`` and ``"non_english"`` columns. We have prepared a large collection of such datasets in our `Parallel Sentences dataset collection <https://huggingface.co/collections/sentence-transformers/parallel-sentences-datasets-6644d644123d31ba5b1c8785>`_.
```
The training script will take the `"english"` column and add a `"label"` column containing the embeddings of the English texts, computed with the teacher model. The student model is then trained so that its embeddings for both the `"english"` and the `"non_english"` texts are similar to this `"label"`. You can load such a training dataset like so:
# {"english": "So I think practicality is one case where it's worth teaching people by hand.", "non_english": "Ich denke, dass es sich aus diesem Grund lohnt, den Leuten das Rechnen von Hand beizubringen."}
```
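A sketch of how such a `"label"` column could be added with the teacher model; the mapping helper below is illustrative, not the exact code of the training script:

```python
def add_teacher_embeddings(batch):
    # Encode the English side with the (frozen) teacher model
    return {"label": teacher_model.encode(batch["english"])}

train_dataset = train_dataset.map(add_teacher_embeddings, batched=True)
```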
## Sources for Training Data
A great website for a vast number of parallel (translated) datasets is [OPUS](http://opus.nlpl.eu/). There, you find parallel datasets for more than 400 languages. You can use these to create your own parallel sentence datasets, if you wish.
## Evaluation
Training can be evaluated in different ways. For an example of how to use these evaluation methods, see [make_multilingual.py](make_multilingual.py).
### MSE Evaluation
You can measure the mean squared error (MSE) between the student embeddings and teacher embeddings.
This evaluator computes the teacher embeddings for the `source_sentences`, for example, for English. During training, the student model is used to compute embeddings for the `target_sentences`, for example, for French. The distance between teacher and student embeddings is measured. Lower scores indicate better performance.
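For example, a sketch of setting up this evaluator inside a training script, assuming an `eval_dataset` with `"english"` and `"non_english"` columns, with `subset`, `teacher_model`, `inference_batch_size`, and the `evaluators` list coming from the surrounding script:

```python
from sentence_transformers.evaluation import MSEEvaluator

# Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
dev_mse = MSEEvaluator(
    source_sentences=eval_dataset["english"],
    target_sentences=eval_dataset["non_english"],
    name=subset,
    teacher_model=teacher_model,
    batch_size=inference_batch_size,
)
evaluators.append(dev_mse)
```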
### Translation Accuracy
You can also measure the translation accuracy. As inputs, this evaluator accepts a list of `source_sentences` (e.g. English), and a list of `target_sentences` (e.g. Spanish), such that `target_sentences[i]` is a translation of `source_sentences[i]`.
For each sentence pair, we check whether the embedding of `source_sentences[i]` is closest (by cosine similarity) to the embedding of `target_sentences[i]` out of all target sentences. If this is the case, we have a hit, otherwise an error. This evaluator reports accuracy (higher = better).
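For example, again assuming an `eval_dataset` with `"english"` and `"non_english"` columns from the surrounding training script:

```python
from sentence_transformers.evaluation import TranslationEvaluator

# TranslationEvaluator computes the embeddings for all parallel sentences. It then checks whether
# the embedding of source[i] is the closest to target[i] out of all available target sentences.
dev_trans_acc = TranslationEvaluator(
    source_sentences=eval_dataset["english"],
    target_sentences=eval_dataset["non_english"],
    name=subset,
    batch_size=inference_batch_size,
)
evaluators.append(dev_trans_acc)
```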
scores=[score/5.0forscoreintest_dataset["score"]],# Convert 0-5 scores to 0-1 scores
batch_size=32,
name=f"sts17-nl-en-test",
show_progress_bar=False,
)
```
Here, `sentences1` and `sentences2` are lists of sentences, and `score` is a numeric value indicating the semantic similarity between `sentences1[i]` and `sentences2[i]`.
## Performance
The performance was evaluated on the [Semantic Textual Similarity (STS) 2017 dataset](http://ixa2.si.ehu.es/stswiki/index.php/Main_Page). The task is to predict the semantic similarity (on a scale 0-5) of two given sentences. STS2017 has monolingual test data for English, Arabic, and Spanish, and cross-lingual test data for English-Arabic, -Spanish and -Turkish.
We extended STS2017 and added cross-lingual test data for English-German, French-English, Italian-English, and Dutch-English.
## Data Format
As training data we require parallel sentences, i.e., sentences translated into various languages. The data format is a tab-separated .tsv file: the first column contains the source sentence, for example, an English sentence; the following columns contain the translations of this source sentence. If you have multiple translations per source sentence, you can put them in the same line or in different lines.
```
Sentences are separated with a tab character.    Die Sätze sind per Tab getrennt.    Las oraciones se separan con un carácter de tabulación.
```
The order of the translations is not important; it only matters that the first column contains a sentence in a language that is understood by the teacher model.
## Loading Training Datasets
You can load such a training file using the *ParallelSentencesDataset* class, as shown in the sketch below. You load a file with the *load_data()* method; you can load multiple files by calling *load_data()* multiple times, and both regular files and .gz-compressed files are supported.

By default, all datasets are weighted equally, so (source, translation)-pairs are sampled equally from all loaded files. If you pass a `weight` parameter (integer), you can weight some datasets higher or lower.
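A minimal sketch of this workflow; the file names are placeholders, and *ParallelSentencesDataset* belongs to the older training API:

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.datasets import ParallelSentencesDataset

teacher_model = SentenceTransformer("paraphrase-distilroberta-base-v2")  # example teacher
student_model = SentenceTransformer("xlm-roberta-base")

train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model)
train_data.load_data("parallel-sentences-en-de.tsv.gz")  # .gz-compressed files work as well
train_data.load_data("parallel-sentences-en-es.tsv")     # an optional integer `weight` parameter re-weights a file

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=64)
train_loss = losses.MSELoss(model=student_model)
```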
The [examples/training/multilingual](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/multilingual/) folder contains some scripts that download parallel training data and bring it into the right format:
- [get_parallel_data_opus.py](get_parallel_data_opus.py): This script downloads data from the [OPUS](http://opus.nlpl.eu/) website.
- [get_parallel_data_tatoeba.py](get_parallel_data_tatoeba.py): This script downloads data from the [Tatoeba](https://tatoeba.org/) website, a website for language learners with example sentences in many languages.
- [get_parallel_data_talks.py](get_parallel_data_talks.py): This script downloads the parallel sentences corpus, which contains transcripts and translations of more than 4,000 talks in 100+ languages.
## Citation
If you use the code for multilingual models, feel free to cite our publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813):
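```bibtex
@inproceedings{reimers-2020-multilingual-sentence-bert,
    title = "Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing",
    year = "2020",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/2004.09813",
}
```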
# Natural Language Inference
Given two sentences (premise and hypothesis), Natural Language Inference (NLI) is the task of deciding whether the premise entails the hypothesis, whether they contradict each other, or whether they are neutral. Commonly used NLI datasets are [SNLI](https://huggingface.co/datasets/stanfordnlp/snli) and [MultiNLI](https://huggingface.co/datasets/nyu-mll/multi_nli).
[Conneau et al.](https://arxiv.org/abs/1705.02364) showed that NLI data can be quite useful when training Sentence Embedding methods. We also found this in our [Sentence-BERT-Paper](https://arxiv.org/abs/1908.10084) and often use NLI as a first fine-tuning step for sentence embedding methods.
To train on NLI, see the following example files:
1. **[training_nli.py](training_nli.py)**:
```eval_rst
This example uses :class:`~sentence_transformers.losses.SoftmaxLoss` as described in the original `Sentence Transformers paper <https://arxiv.org/abs/1908.10084>`_.
```
2. **[training_nli_v2.py](training_nli_v2.py)**:
```eval_rst
The :class:`~sentence_transformers.losses.SoftmaxLoss` as used in our original SBERT paper does not yield optimal performance. A better loss is :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss`, where we provide pairs or triplets. In this script, we provide a triplet of the format: (anchor, entailment_sentence, contradiction_sentence). The NLI data provides such triplets. The :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` yields much better performance and is more intuitive than :class:`~sentence_transformers.losses.SoftmaxLoss`. We have used this loss to train the paraphrase model in our `Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation <https://arxiv.org/abs/2004.09813>`_ paper.
```
3. **[training_nli_v3.py](training_nli_v3.py)**:
```eval_rst
Following the `GISTEmbed <https://arxiv.org/abs/2402.16829>`_ paper, we can modify the in-batch negative selection from :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` using a guiding model. Candidate negative pairs are ignored during training if the guiding model considers the pair to be too similar. In practice, the :class:`~sentence_transformers.losses.GISTEmbedLoss` tends to produce a stronger training signal than :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` at the cost of some training overhead for running inference on the guiding model.
```
## Data
We combine [SNLI](https://huggingface.co/datasets/stanfordnlp/snli) and [MultiNLI](https://huggingface.co/datasets/nyu-mll/multi_nli) into a dataset we call [AllNLI](https://huggingface.co/datasets/sentence-transformers/all-nli). These two datasets contain sentence pairs and one of three labels: entailment, neutral, contradiction:
| Sentence A (Premise) | Sentence B (Hypothesis) | Label |
| --- | --- | --- |
| A soccer game with multiple males playing. | Some men are playing a sport. | entailment |
| An older and younger man smiling. | Two men are smiling and laughing at the cats playing on the floor. | neutral |
| A man inspects the uniform of a figure in some East Asian country. | The man is sleeping. | contradiction |
We format AllNLI in a few different subsets, compatible with different loss functions. See for example the [triplet subset of AllNLI](https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/triplet).
## SoftmaxLoss
```eval_rst
`Conneau et al. <https://arxiv.org/abs/1705.02364>`_ described how a softmax classifier on top of a `siamese network <https://en.wikipedia.org/wiki/Siamese_neural_network>`_ can be used to learn meaningful sentence representations. We can achieve this by using :class:`~sentence_transformers.losses.SoftmaxLoss`:
```
We pass the two sentences through our SentenceTransformer model and get the sentence embeddings *u* and *v*. We then concatenate *u*, *v* and *|u-v|* to form one long vector. This vector is then passed to a softmax classifier, which predicts our three classes (entailment, neutral, contradiction).
This setup learns sentence embeddings that can later be used for a wide variety of tasks.
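A minimal sketch of this setup, assuming the "pair-class" subset of AllNLI with (premise, hypothesis, label) columns; the base model name is an example:

```python
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("bert-base-uncased")  # any base model works here
train_dataset = load_dataset("sentence-transformers/all-nli", "pair-class", split="train")
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=3,  # entailment, neutral, contradiction
)
```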
## MultipleNegativesRankingLoss
```eval_rst
That the :class:`~sentence_transformers.losses.SoftmaxLoss` with NLI data produces (relatively) good sentence embeddings is rather coincidental. The :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` is much more intuitive and produces significantly better sentence representations.
```
The training data for MultipleNegativesRankingLoss consists of sentence pairs [(a<sub>1</sub>, b<sub>1</sub>), ..., (a<sub>n</sub>, b<sub>n</sub>)] where we assume that (a<sub>i</sub>, b<sub>i</sub>) are similar sentences and (a<sub>i</sub>, b<sub>j</sub>) are dissimilar sentences for i != j. The loss minimizes the distance between (a<sub>i</sub>, b<sub>i</sub>) while simultaneously maximizing the distance between (a<sub>i</sub>, b<sub>j</sub>) for all i != j. For example:
The distance between (a<sub>1</sub>, b<sub>1</sub>) is reduced, while the distance between (a<sub>1</sub>, b<sub>2...5</sub>) will be increased. The same is done for a<sub>2</sub>, ..., a<sub>5</sub>.
```eval_rst
Using :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` with NLI is rather easy: we define sentences that have an *entailment* label as positive pairs. E.g., we have pairs like (*"A soccer game with multiple males playing."*, *"Some men are playing a sport."*) and want these pairs to be close in vector space. The `pair subset of AllNLI <https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/pair>`_ has been prepared in this format.
```
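A minimal sketch of this setup with the pair subset; the base model name is an example:

```python
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses

# (anchor, positive) pairs; the other in-batch positives serve as negatives
model = SentenceTransformer("bert-base-uncased")
train_dataset = load_dataset("sentence-transformers/all-nli", "pair", split="train")
train_loss = losses.MultipleNegativesRankingLoss(model=model)
```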
### MultipleNegativesRankingLoss with Hard Negatives
We can further improve MultipleNegativesRankingLoss by providing triplets rather than pairs: [(a<sub>1</sub>, b<sub>1</sub>, c<sub>1</sub>), ..., (a<sub>n</sub>, b<sub>n</sub>, c<sub>n</sub>)]. The samples for c<sub>i</sub> are so-called hard-negatives: On a lexical level, they are similar to a<sub>i</sub> and b<sub>i</sub>, but on a semantic level, they mean different things and should not be close to a<sub>i</sub> in the vector space.
For NLI data, we can use the contradiction-label to create such triplets with a hard negative. So our triplets look like this:
("*A soccer game with multiple males playing."*, *"Some men are playing a sport."*, *"A group of men playing a baseball game."*).
("*A soccer game with multiple males playing."*, *"Some men are playing a sport."*, *"A group of men playing a baseball game."*). We want the sentences *"A soccer game with multiple males playing."* and *"Some men are playing a sport."* to be close in the vector space, while there should be a larger distance between *"A soccer game with multiple males playing."* and "*A group of men playing a baseball game."*. The [triplet subset of AllNLI](https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/triplet) has been prepared in this format.
### GISTEmbedLoss
```eval_rst
:class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` can be extended even further by recognizing that the in-batch negative sampling as shown in `this example <#multiplenegativesrankingloss>`_ is a bit flawed. In particular, we automatically assume that the pairs (a\ :sub:`1`\ , b\ :sub:`2`\ ), ..., (a\ :sub:`1`\ , b\ :sub:`n`\ ) are negative, but that does not strictly have to be true.
To address this, :class:`~sentence_transformers.losses.GISTEmbedLoss` uses a Sentence Transformer model to guide the in-batch negative sample selection. In particular, if the guide model considers the similarity of (a\ :sub:`1`\ , b\ :sub:`n`\ ) to be larger than (a\ :sub:`1`\ , b\ :sub:`1`\ ), then the (a\ :sub:`1`\ , b\ :sub:`n`\ ) pair is considered a false negative and consequently ignored in the training process. In essence, this results in higher quality training data for the model.
```
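A minimal sketch of this loss; the model names are examples, and the AllNLI triplet subset provides the (anchor, positive, negative) columns:

```python
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses

# A small, fast guide model filters out likely false negatives during training
model = SentenceTransformer("bert-base-uncased")
guide = SentenceTransformer("all-MiniLM-L6-v2")
train_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="train")
train_loss = losses.GISTEmbedLoss(model=model, guide=guide)
```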