Commit b0f4f53a authored by Rayyyyy's avatar Rayyyyy
Browse files

Update version according to github

parent 392df446
......@@ -14,14 +14,16 @@ python eval_cross-encoder-trec-dl.py cross-encoder-model-name
"""
import gzip
from collections import defaultdict
import logging
import tqdm
import numpy as np
import os
import sys
from collections import defaultdict
import numpy as np
import pytrec_eval
from sentence_transformers import util, CrossEncoder
import os
import tqdm
from sentence_transformers import CrossEncoder, util
data_folder = "trec2019-data"
os.makedirs(data_folder, exist_ok=True)
......@@ -32,7 +34,7 @@ queries_filepath = os.path.join(data_folder, "msmarco-test2019-queries.tsv.gz")
if not os.path.exists(queries_filepath):
logging.info("Download " + os.path.basename(queries_filepath))
util.http_get(
"https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", queries_filepath
"https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", queries_filepath
)
with gzip.open(queries_filepath, "rt", encoding="utf8") as fIn:
......@@ -69,7 +71,8 @@ passage_filepath = os.path.join(data_folder, "msmarco-passagetest2019-top1000.ts
if not os.path.exists(passage_filepath):
logging.info("Download " + os.path.basename(passage_filepath))
util.http_get(
"https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz", passage_filepath
"https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz",
passage_filepath,
)
......
......@@ -6,12 +6,13 @@ Usage:
python eval_msmarco.py model_name [max_corpus_size_in_thousands]
"""
from sentence_transformers import LoggingHandler, SentenceTransformer, evaluation, util
import logging
import sys
import os
import sys
import tarfile
from sentence_transformers import LoggingHandler, SentenceTransformer, evaluation, util
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
......@@ -42,14 +43,16 @@ if not os.path.exists(collection_filepath) or not os.path.exists(dev_queries_fil
tar_filepath = os.path.join(data_folder, "collectionandqueries.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download: " + tar_filepath)
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz", tar_filepath)
util.http_get(
"https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz", tar_filepath
)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
if not os.path.exists(qrels_filepath):
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv", qrels_filepath)
util.http_get("https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv", qrels_filepath)
### Load data
......
......@@ -8,12 +8,14 @@ Usage:
python translate_queries [target_language]
"""
import os
from sentence_transformers import LoggingHandler, util
import logging
import os
import sys
import tarfile
from easynmt import EasyNMT
import sys
from sentence_transformers import LoggingHandler, util
#### Just some code to print debug information to stdout
logging.basicConfig(
......@@ -44,7 +46,7 @@ os.makedirs(data_folder, exist_ok=True)
train_queries = {}
qrels_train = os.path.join(data_folder, "qrels.train.tsv")
if not os.path.exists(qrels_train):
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv", qrels_train)
util.http_get("https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.train.tsv", qrels_train)
with open(qrels_train) as fIn:
for line in fIn:
......@@ -58,7 +60,7 @@ if not os.path.exists(queries_filepath):
tar_filepath = os.path.join(data_folder, "queries.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download queries.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
util.http_get("https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
......
import sys
import argparse
import gzip
import json
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, losses, InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
import tqdm
from torch.utils.data import Dataset
import pickle
import random
import sys
import tarfile
from datetime import datetime
from shutil import copyfile
import pickle
import argparse
import tqdm
from torch.utils.data import DataLoader, Dataset
from sentence_transformers import InputExample, LoggingHandler, SentenceTransformer, losses, models, util
#### Just some code to print debug information to stdout
logging.basicConfig(
......@@ -89,7 +90,7 @@ if not os.path.exists(collection_filepath):
tar_filepath = os.path.join(data_folder, "collection.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download collection.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
util.http_get("https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
......@@ -109,7 +110,7 @@ if not os.path.exists(queries_filepath):
tar_filepath = os.path.join(data_folder, "queries.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download queries.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
util.http_get("https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
......@@ -165,7 +166,7 @@ with gzip.open(hard_negatives_filepath, "rt") as fIn:
negs_to_use = args.negs_to_use.split(",")
else: # Use all systems
negs_to_use = list(data["neg"].keys())
logging.info("Using negatives from the following systems:", negs_to_use)
logging.info("Using negatives from the following systems: {}".format(", ".join(negs_to_use)))
for system_name in negs_to_use:
if system_name not in data["neg"]:
......
......@@ -17,19 +17,20 @@ Running this script:
python train_bi-encoder-v3.py
"""
import argparse
import gzip
import json
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, losses, InputExample
import logging
from datetime import datetime
import gzip
import os
import pickle
import random
import tarfile
from datetime import datetime
import tqdm
from torch.utils.data import Dataset
import random
import pickle
import argparse
from torch.utils.data import DataLoader, Dataset
from sentence_transformers import InputExample, LoggingHandler, SentenceTransformer, losses, models, util
#### Just some code to print debug information to stdout
logging.basicConfig(
......@@ -99,7 +100,7 @@ if not os.path.exists(collection_filepath):
tar_filepath = os.path.join(data_folder, "collection.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download collection.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
util.http_get("https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
......@@ -119,7 +120,7 @@ if not os.path.exists(queries_filepath):
tar_filepath = os.path.join(data_folder, "queries.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download queries.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
util.http_get("https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
......
......@@ -17,17 +17,18 @@ Running this script:
python train_cross-encoder-v2.py
"""
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import InputExample
import logging
from datetime import datetime
import gzip
import logging
import os
import tarfile
from datetime import datetime
import torch
from torch.utils.data import DataLoader
from sentence_transformers import InputExample, LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
#### Just some code to print debug information to stdout
logging.basicConfig(
......@@ -64,7 +65,7 @@ if not os.path.exists(collection_filepath):
tar_filepath = os.path.join(data_folder, "collection.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download collection.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
util.http_get("https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
......@@ -82,7 +83,7 @@ if not os.path.exists(queries_filepath):
tar_filepath = os.path.join(data_folder, "queries.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download queries.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
util.http_get("https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
......
......@@ -14,17 +14,18 @@ Running this script:
python train_cross-encoder.py
"""
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import InputExample
import logging
from datetime import datetime
import gzip
import logging
import os
import tarfile
from datetime import datetime
import tqdm
from torch.utils.data import DataLoader
from sentence_transformers import InputExample, LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
#### Just some code to print debug information to stdout
logging.basicConfig(
......@@ -71,7 +72,7 @@ if not os.path.exists(collection_filepath):
tar_filepath = os.path.join(data_folder, "collection.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download collection.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
util.http_get("https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
......@@ -89,7 +90,7 @@ if not os.path.exists(queries_filepath):
tar_filepath = os.path.join(data_folder, "queries.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download queries.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
util.http_get("https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
......
# Multilingual-Models
# Multilingual Models
The issue with multilingual BERT (mBERT) as well as with XLM-RoBERTa is that those produce rather bad sentence representation out-of-the-box. Further, the vectors spaces between languages are not aligned, i.e., the sentences with the same content in different languages would be mapped to different locations in the vector space.
In my publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813) I describe any easy approach to extend sentence embeddings to further languages.
In my publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813) I describe an easy approach to extend sentence embeddings to further languages.
Chien Vu also wrote a nice blog article on this technique: [A complete guide to transfer learning from English to other Languages using Sentence Embeddings BERT Models](https://towardsdatascience.com/a-complete-guide-to-transfer-learning-from-english-to-other-languages-using-sentence-embeddings-8c427f8804a9)
## Available Pre-trained Models
For a list of available models, see [Pretrained Models](https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models).
## Extend your own models
![Multilingual Knowledge Distillation](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/multilingual-distillation.png)
The idea is based on a fixed (monolingual) **teacher model** that produces sentence embeddings with our desired properties in one language (e.g. English). The **student model** is supposed to mimic the teacher model, i.e., the same English sentence should be mapped to the same vector by the teacher and by the student model. Additionally, in order to make the student model work for other languages, we train the student model on parallel (translated) sentences. The translation of each sentence should also be mapped to the same vector as the original sentence.
In the above figure, the student model should map *Hello World* and the German translation *Hallo Welt* to the vector of ``teacher_model('Hello World')``. We achieve this by training the student model using mean squared error (MSE) loss.
In our experiments we initialized the student model with the multilingual [XLM-RoBERTa model](https://huggingface.co/FacebookAI/xlm-roberta-base).
## Training
For a **fully automatic code example**, see [make_multilingual.py](make_multilingual.py).
This scripts downloads the parallel sentences corpus, a corpus with transcripts and translations from talks. It than extends a monolingual model to several languages (en, de, es, it, fr, ar, tr). This corpus contains parallel data for more than 100 languages, hence, you can simple change the script and train a multilingual model in your favorite languages.
## Datasets
```eval_rst
As training data we require parallel sentences, i.e., sentences translated in various languages. In particular, we will use :class:`~datasets.Dataset` instances with ``"english"`` and ``"non_english"`` columns. We have prepared a large collection of such datasets in our `Parallel Sentences dataset collection <https://huggingface.co/collections/sentence-transformers/parallel-sentences-datasets-6644d644123d31ba5b1c8785>`_.
```
The training script will take the `"english"` column and add a `"label"` column containing the embeddings of the english texts. Then, the student model `"english"` and `"non_english"` will be trained to be similar to this `"label"`. You can load such a training dataset like so:
```python
from datasets import load_dataset
train_dataset = load_dataset("sentence-transformers/parallel-sentences-talks", "en-de", split="train")
print(train_dataset[0])
# {"english": "So I think practicality is one case where it's worth teaching people by hand.", "non_english": "Ich denke, dass es sich aus diesem Grund lohnt, den Leuten das Rechnen von Hand beizubringen."}
```
## Sources for Training Data
A great website for a vast number of parallel (translated) datasets is [OPUS](http://opus.nlpl.eu/). There, you find parallel datasets for more than 400 languages. You can use these to create your own parallel sentence datasets, if you wish.
## Evaluation
Training can be evaluated in different ways. For an example how to use these evaluation methods, see [make_multilingual.py](make_multilingual.py).
### MSE Evaluation
You can measure the mean squared error (MSE) between the student embeddings and teacher embeddings.
```python
from datasets import load_dataset
eval_dataset = load_dataset("sentence-transformers/parallel-sentences-talks", "en-fr", split="dev")
dev_mse = MSEEvaluator(
source_sentences=eval_dataset["english"],
target_sentences=eval_dataset["non_english"],
name="en-fr-dev",
teacher_model=teacher_model,
batch_size=32,
)
```
This evaluator computes the teacher embeddings for the `source_sentences`, for example, for English. During training, the student model is used to compute embeddings for the `target_sentences`, for example, for French. The distance between teacher and student embeddings is measures. Lower scores indicate a better performance.
### Translation Accuracy
You can also measure the translation accuracy. As inputs, this evaluator accepts a list of `source_sentences` (e.g. English), and a list of `target_sentences` (e.g. Spanish), such that `target_sentences[i]` is a translation of `source_sentences[i]`.
For each sentence pair, we check if `source_sentences[i]` we check if `target_sentences[i]` has the highest similarity out of all target sentences. If this is the case, we have a hit, otherwise an error. This evaluator reports accuracy (higher = better).
```python
from datasets import load_dataset
eval_dataset = load_dataset("sentence-transformers/parallel-sentences-talks", "en-fr", split="dev")
dev_trans_acc = TranslationEvaluator(
source_sentences=eval_dataset["english"],
target_sentences=eval_dataset["non_english"],
name="en-fr-dev",
batch_size=32,
)
```
### Multilingual Semantic Textual Similarity
You can also measure the semantic textual similarity (STS) between sentence pairs in different languages:
```python
from datasets import load_dataset
test_dataset = load_dataset("mteb/sts17-crosslingual-sts", "nl-en", split="test")
test_emb_similarity = EmbeddingSimilarityEvaluator(
sentences1=test_dataset["sentence1"],
sentences2=test_dataset["sentence2"],
scores=[score / 5.0 for score in test_dataset["score"]], # Convert 0-5 scores to 0-1 scores
batch_size=32,
name=f"sts17-nl-en-test",
show_progress_bar=False,
)
```
Where `sentences1` and `sentences2` are lists of sentences and score is numeric value indicating the semantic similarity between `sentences1[i]` and `sentences2[i]`.
## Available Pre-trained Models
For a list of available models, see [Pretrained Models](../../../docs/sentence_transformer/pretrained_models.html#multilingual-models).
## Usage
You can use the models in the following way:
```python
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("model-name")
embeddings = embedder.encode(["Hello World", "Hallo Welt", "Hola mundo"])
print(embeddings)
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
embeddings = model.encode(["Hello World", "Hallo Welt", "Hola mundo", "Bye, Moon!"])
similarities = model.similarity(embeddings, embeddings)
# tensor([[1.0000, 0.9429, 0.8880, 0.4558],
# [0.9429, 1.0000, 0.9680, 0.5307],
# [0.8880, 0.9680, 1.0000, 0.4933],
# [0.4558, 0.5307, 0.4933, 1.0000]])
```
## Performance
The performance was evaluated on the [Semantic Textual Similarity (STS) 2017 dataset](http://ixa2.si.ehu.es/stswiki/index.php/Main_Page). The task is to predict the semantic similarity (on a scale 0-5) of two given sentences. STS2017 has monolingual test data for English, Arabic, and Spanish, and cross-lingual test data for English-Arabic, -Spanish and -Turkish.
......@@ -101,105 +198,6 @@ We extended the STS2017 and added cross-lingual test data for English-German, Fr
</tr>
</table>
## Extend your own models
![Multilingual Knowledge Distillation](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/multilingual-distillation.png)
The idea is based on a fixed (monolingual) **teacher model**, that produces sentence embeddings with our desired properties in one language. The **student model** is supposed to mimic the teacher model, i.e., the same English sentence should be mapped to the same vector by the teacher and by the student model. In order that the student model works for further languages, we train the student model on parallel (translated) sentences. The translation of each sentence should also be mapped to the same vector as the original sentence.
In the above figure, the student model should map *Hello World* and the German translation *Hallo Welt* to the vector of *teacher_model('Hello World')*. We achieve this by training the student model using mean squared error (MSE) loss.
In our experiments we initialized the student model with the multilingual XLM-RoBERTa model.
## Training
For a **fully automatic code example**, see [make_multilingual.py](make_multilingual.py).
This scripts downloads the parallel sentences corpus, a corpus with transcripts and translations from talks. It than extends a monolingual model to several languages (en, de, es, it, fr, ar, tr). This corpus contains parallel data for more than 100 languages, hence, you can simple change the script and train a multilingual model in your favorite languages.
## Data Format
As training data we require parallel sentences, i.e., sentences translated in various languages. As data format, we use a tab-separated .tsv file. In the first column, you have your source sentence, for example, an English sentence. In the following columns, you have the translations of this source sentence. If you have multiple translations per source sentence, you can put them in the same line or in different lines.
```
Source_sentence Target_lang1 Target_lang2 Target_lang3
Source_sentence Target_lang1 Target_lang2
```
An example file could look like this (EN DE ES):
```
Hello World Hallo Welt Hola Mundo
Sentences are separated with a tab character. Die Sätze sind per Tab getrennt. Las oraciones se separan con un carácter de tabulación.
```
The order of the translations are not important, it is only important that the first column contains a sentence in a language that is understood by the teacher model.
## Loading Training Datasets
You can load such a training file using the *ParallelSentencesDataset* class:
```python
from sentence_transformers.datasets import ParallelSentencesDataset
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model)
train_data.load_data("path/to/tab/separated/train-en-de.tsv")
train_data.load_data("path/to/tab/separated/train-en-es.tsv.gz")
train_data.load_data("path/to/tab/separated/train-en-fr.tsv.gz")
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)
```
You load a file with the *load_data()* method. You can load multiple files by calling load_data multiple times. You can also regular files or .gz-compressed files.
Per default, all datasets are weighted equally. In the above example a (source, translation)-pair will be sampled equally from all three datasets. If you pass a `weight` parameter (integer), you can weight some datasets higher or lower.
## Sources for Training Data
A great website for a vast number of parallel (translated) datasets is [OPUS](http://opus.nlpl.eu/). There, you find parallel datasets for more than 400 languages.
The [examples/training/multilingual](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/multilingual/) folder contains some scripts that downloads parallel training data and brings it into the right format:
- [get_parallel_data_opus.py](get_parallel_data_opus.py): This script downloads data from the [OPUS](http://opus.nlpl.eu/) website.
- [get_parallel_data_tatoeba.py](get_parallel_data_tatoeba.py): This script downloads data from the [Tatoeba](https://tatoeba.org/) website, a website for language learners with example sentences for more than many languages.
- [get_parallel_data_talks.py](get_parallel_data_talks.py): This script downloads data the parallel sentences corpus, which contains transcripts and translations of more than 4,000 talks in 100+ languages.
## Evaluation
Training can be evaluated in different ways. For an example how to use these evaluation methods, see [make_multilingual.py](make_multilingual.py).
### MSE Evaluation
You can measure the mean squared error (MSE) between the student embeddings and teacher embeddings. This can be achieved with the ``
```python
# src_sentences and trg_sentences are lists of translated sentences, such that trg_sentences[i] is the translation of src_sentences[i]
dev_mse = evaluation.MSEEvaluator(src_sentences, trg_sentences, teacher_model=teacher_model)
```
This evaluator computes the teacher embeddings for the `src_sentences`, for example, for English. During training, the student model is used to compute embeddings for the `trg_sentences`, for example, for Spanish. The distance between teacher and student embeddings is measures. Lower scores indicate a better performance.
### Translation Accuracy
You can also measure the translation accuracy. Given a list with source sentences, for example, 1000 English sentences. And a list with matching target (translated) sentences, for example, 1000 Spanish sentences.
For each sentence pair, we check if their embeddings are the closest using cosine similarity. I.e., for each `src_sentences[i]` we check if `trg_sentences[i]` has the highest similarity out of all target sentences. If this is the case, we have a hit, otherwise an error. This evaluator reports accuracy (higher = better).
```python
# src_sentences and trg_sentences are lists of translated sentences, such that trg_sentences[i] is the translation of src_sentences[i]
dev_trans_acc = evaluation.TranslationEvaluator(
src_sentences,
trg_sentences,
name=os.path.basename(dev_file),
batch_size=inference_batch_size,
)
```
### Multi-Lingual Semantic Textual Similarity
You can also measure the semantic textual similarity (STS) between sentence pairs in different languages:
```python
sts_evaluator = evaluation.EmbeddingSimilarityEvaluatorFromList(sentences1, sentences2, scores)
```
Where `sentences1` and `sentences2` are lists of sentences and score is numeric value indicating the semantic similarity between `sentences1[i]` and `sentences2[i]`.
## Citation
If you use the code for multilingual models, feel free to cite our publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813):
```
......
......@@ -29,9 +29,9 @@ This python code automates the download and creation of the parallel sentences f
"""
from opustools import OpusRead
import os
from opustools import OpusRead
corpora = ["JW300"] # Corpora you want to use
source_languages = ["en"] # Source language, our teacher model is able to understand
......
......@@ -6,19 +6,20 @@ The parallel sentences corpus is a crawl of transcripts from talks, which are tr
The parallel sentences corpus cannot be downloaded automatically. It is available for research purposes only (CC-BY-NC).
The training procedure can be found in the files make_multilingual.py and make_multilingual_sys.py.
The training procedure can be found in the files make_multilingual.py.
Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""
import os
import sentence_transformers.util
import gzip
import csv
import gzip
import os
from tqdm.autonotebook import tqdm
import sentence_transformers.util
source_languages = set(["en"]) # Languages our (monolingual) teacher model understands
target_languages = set(["de", "es", "it", "fr", "ar", "tr"]) # New languages we want to extend to
......
......@@ -5,10 +5,11 @@ It is available for more than 300 languages.
This script downloads the Tatoeba corpus and extracts the sentences & translations in the languages you like
"""
import gzip
import os
import sentence_transformers
import tarfile
import gzip
import sentence_transformers
# Note: Tatoeba uses 3 letter languages codes (ISO-639-2),
# while other datasets like OPUS use 2 letter language codes (ISO-639-1)
......
......@@ -9,10 +9,10 @@ Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""
import os
import sentence_transformers.util
import gzip
import os
import sentence_transformers.util
source_languages = set(["en"]) # Languages our (monolingual) teacher model understands
target_languages = set(["de", "es", "it", "fr", "ar", "tr"]) # New languages we want to extend to
......
......@@ -17,20 +17,23 @@ Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""
from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
import logging
import traceback
from datetime import datetime
import os
import logging
import sentence_transformers.util
import csv
import gzip
from tqdm.autonotebook import tqdm
import numpy as np
import zipfile
import io
from datasets import DatasetDict, load_dataset
from sentence_transformers import LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import (
EmbeddingSimilarityEvaluator,
MSEEvaluator,
SequentialEvaluator,
TranslationEvaluator,
)
from sentence_transformers.losses import MSELoss
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
......@@ -38,220 +41,205 @@ logging.basicConfig(
logger = logging.getLogger(__name__)
teacher_model_name = (
"paraphrase-distilroberta-base-v2" # Our monolingual teacher model, we want to convert to multiple languages
)
student_model_name = "xlm-roberta-base" # Multilingual base model we use to imitate the teacher model
# The teacher model is monolingual, we use it for English embeddings
teacher_model_name = "paraphrase-distilroberta-base-v2"
# The student model is multilingual, we train it such that embeddings of non-English texts mimic the teacher model's English embeddings
student_model_name = "xlm-roberta-base"
max_seq_length = 128 # Student model max. lengths for inputs (number of word pieces)
student_max_seq_length = 128 # Student model max. lengths for inputs (number of word pieces)
train_batch_size = 64 # Batch size for training
inference_batch_size = 64 # Batch size at inference
max_sentences_per_language = 500000 # Maximum number of parallel sentences for training
train_max_sentence_length = 250 # Maximum length (characters) for parallel training sentences
num_epochs = 5 # Train for x epochs
num_warmup_steps = 10000 # Warumup steps
num_evaluation_steps = 1000 # Evaluate performance after every xxxx steps
dev_sentences = 1000 # Number of parallel sentences to be used for development
num_train_epochs = 5 # Train for x epochs
num_evaluation_steps = 5000 # Evaluate performance after every xxxx steps
# Define the language codes you would like to extend the model to
source_languages = set(["en"]) # Our teacher model accepts English (en) sentences
target_languages = set(
["de", "es", "it", "fr", "ar", "tr"]
) # We want to extend the model to these new languages. For language codes, see the header of the train file
# We want to extend the model to these new languages. For language codes, see the header of the train file
target_languages = set(["de", "es", "it", "fr", "ar", "tr"])
output_path = (
output_dir = (
"output/make-multilingual-"
+ "-".join(sorted(list(source_languages)) + sorted(list(target_languages)))
+ "-"
+ datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# This function downloads a corpus if it does not exist
def download_corpora(filepaths):
if not isinstance(filepaths, list):
filepaths = [filepaths]
for filepath in filepaths:
if not os.path.exists(filepath):
print(filepath, "does not exists. Try to download from server")
filename = os.path.basename(filepath)
url = "https://sbert.net/datasets/" + filename
sentence_transformers.util.http_get(url, filepath)
# Here we define train train and dev corpora
train_corpus = "datasets/parallel-sentences.tsv.gz"
sts_corpus = "datasets/stsbenchmark.zip"
parallel_sentences_folder = "parallel-sentences/"
# Check if the file exists. If not, they are downloaded
download_corpora([train_corpus, sts_corpus])
# Create parallel files for the selected language combinations
os.makedirs(parallel_sentences_folder, exist_ok=True)
train_files = []
dev_files = []
files_to_create = []
# 1a. Here we define our SentenceTransformer teacher model.
teacher_model = SentenceTransformer(teacher_model_name)
# If we want, we can limit the maximum sequence length for the model
# teacher_model.max_seq_length = 128
logging.info(f"Teacher model: {teacher_model}")
# 1b. Here we define our SentenceTransformer student model. If not already a Sentence Transformer model,
# it will automatically create one with "mean" pooling.
student_model = SentenceTransformer(student_model_name)
# If we want, we can limit the maximum sequence length for the model
student_model.max_seq_length = student_max_seq_length
logging.info(f"Student model: {student_model}")
# 2. Load the parallel sentences training dataset: https://huggingface.co/datasets?other=sentence-transformers&sort=trending&search=parallel-sentences
# NOTE: We can also use multiple datasets if we want
dataset_to_use = "sentence-transformers/parallel-sentences-talks"
# dataset_to_use = "sentence-transformers/parallel-sentences-europarl"
# dataset_to_use = "sentence-transformers/parallel-sentences-global-voices"
# dataset_to_use = "sentence-transformers/parallel-sentences-muse"
# dataset_to_use = "sentence-transformers/parallel-sentences-jw300"
# dataset_to_use = "sentence-transformers/parallel-sentences-news-commentary"
# dataset_to_use = "sentence-transformers/parallel-sentences-opensubtitles"
# dataset_to_use = "sentence-transformers/parallel-sentences-tatoeba"
# dataset_to_use = "sentence-transformers/parallel-sentences-wikimatrix"
# dataset_to_use = "sentence-transformers/parallel-sentences-wikititles"
train_dataset_dict = DatasetDict()
eval_dataset_dict = DatasetDict()
for source_lang in source_languages:
for target_lang in target_languages:
output_filename_train = os.path.join(
parallel_sentences_folder, "talks-{}-{}-train.tsv.gz".format(source_lang, target_lang)
)
output_filename_dev = os.path.join(
parallel_sentences_folder, "talks-{}-{}-dev.tsv.gz".format(source_lang, target_lang)
subset = f"{source_lang}-{target_lang}"
try:
train_dataset = load_dataset(dataset_to_use, subset, split="train")
if len(train_dataset) > max_sentences_per_language:
train_dataset = train_dataset.select(range(max_sentences_per_language))
except Exception as exc:
logging.error(f"Could not load dataset {dataset_to_use}/{source_lang}-{target_lang}: {exc}")
continue
try:
eval_dataset = load_dataset(dataset_to_use, subset, split="dev")
if len(eval_dataset) > 1000:
eval_dataset = eval_dataset.select(range(1000))
except Exception:
logging.info(
f"Could not load dataset {dataset_to_use}/{source_lang}-{target_lang} dev split, splitting 1k samples from train"
)
train_files.append(output_filename_train)
dev_files.append(output_filename_dev)
if not os.path.exists(output_filename_train) or not os.path.exists(output_filename_dev):
files_to_create.append(
{
"src_lang": source_lang,
"trg_lang": target_lang,
"fTrain": gzip.open(output_filename_train, "wt", encoding="utf8"),
"fDev": gzip.open(output_filename_dev, "wt", encoding="utf8"),
"devCount": 0,
dataset = train_dataset.train_test_split(test_size=1000, shuffle=True)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
train_dataset_dict[subset] = train_dataset
eval_dataset_dict[subset] = eval_dataset
logging.info(train_dataset_dict)
# We want the student EN embeddings to be similar to the teacher EN embeddings and
# the student non-EN embeddings to be similar to the teacher EN embeddings
def prepare_dataset(batch):
return {
"english": batch["english"],
"non_english": batch["non_english"],
"label": teacher_model.encode(batch["english"], batch_size=inference_batch_size, show_progress_bar=False),
}
)
if len(files_to_create) > 0:
print(
"Parallel sentences files {} do not exist. Create these files now".format(
", ".join(map(lambda x: x["src_lang"] + "-" + x["trg_lang"], files_to_create))
)
)
with gzip.open(train_corpus, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for line in tqdm(reader, desc="Sentences"):
for outfile in files_to_create:
src_text = line[outfile["src_lang"]].strip()
trg_text = line[outfile["trg_lang"]].strip()
if src_text != "" and trg_text != "":
if outfile["devCount"] < dev_sentences:
outfile["devCount"] += 1
fOut = outfile["fDev"]
else:
fOut = outfile["fTrain"]
fOut.write("{}\t{}\n".format(src_text, trg_text))
for outfile in files_to_create:
outfile["fTrain"].close()
outfile["fDev"].close()
######## Start the extension of the teacher model to multiple languages ########
logger.info("Load teacher model")
teacher_model = SentenceTransformer(teacher_model_name)
logger.info("Create student model from scratch")
word_embedding_model = models.Transformer(student_model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(
student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True
column_names = list(train_dataset_dict.values())[0].column_names
train_dataset_dict = train_dataset_dict.map(
prepare_dataset, batched=True, batch_size=30000, remove_columns=column_names
)
for train_file in train_files:
train_data.load_data(
train_file, max_sentences=max_sentences_per_language, max_sentence_length=train_max_sentence_length
)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)
logging.info("Prepared datasets for training:", train_dataset_dict)
# 3. Define our training loss
# MSELoss (https://sbert.net/docs/package_reference/sentence_transformer/losses.html#mseloss) needs one text columns and one
# column with embeddings from the teacher model
train_loss = MSELoss(model=student_model)
#### Evaluate cross-lingual performance on different tasks #####
evaluators = [] # evaluators has a list of different evaluator classes we call periodically
# 4. Define evaluators for use during training. This is useful to keep track of alongside the evaluation loss.
evaluators = []
for dev_file in dev_files:
logger.info("Create evaluator for " + dev_file)
src_sentences = []
trg_sentences = []
with gzip.open(dev_file, "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
if splits[0] != "" and splits[1] != "":
src_sentences.append(splits[0])
trg_sentences.append(splits[1])
for subset, eval_dataset in eval_dataset_dict.items():
logger.info(f"Creating evaluators for {subset}")
# Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
dev_mse = evaluation.MSEEvaluator(
src_sentences,
trg_sentences,
name=os.path.basename(dev_file),
dev_mse = MSEEvaluator(
source_sentences=eval_dataset["english"],
target_sentences=eval_dataset["non_english"],
name=subset,
teacher_model=teacher_model,
batch_size=inference_batch_size,
)
evaluators.append(dev_mse)
# TranslationEvaluator computes the embeddings for all parallel sentences. It then check if the embedding of source[i] is the closest to target[i] out of all available target sentences
dev_trans_acc = evaluation.TranslationEvaluator(
src_sentences, trg_sentences, name=os.path.basename(dev_file), batch_size=inference_batch_size
# TranslationEvaluator computes the embeddings for all parallel sentences. It then check if the embedding of
# source[i] is the closest to target[i] out of all available target sentences
dev_trans_acc = TranslationEvaluator(
source_sentences=eval_dataset["english"],
target_sentences=eval_dataset["non_english"],
name=subset,
batch_size=inference_batch_size,
)
evaluators.append(dev_trans_acc)
##### Read cross-lingual Semantic Textual Similarity (STS) data ####
all_languages = list(set(list(source_languages) + list(target_languages)))
sts_data = {}
# Open the ZIP File of STS2017-extended.zip and check for which language combinations we have STS data
with zipfile.ZipFile(sts_corpus) as zip:
filelist = zip.namelist()
sts_files = []
for i in range(len(all_languages)):
for j in range(i, len(all_languages)):
lang1 = all_languages[i]
lang2 = all_languages[j]
filepath = "STS2017-extended/STS.{}-{}.txt".format(lang1, lang2)
if filepath not in filelist:
lang1, lang2 = lang2, lang1
filepath = "STS2017-extended/STS.{}-{}.txt".format(lang1, lang2)
if filepath in filelist:
filename = os.path.basename(filepath)
sts_data[filename] = {"sentences1": [], "sentences2": [], "scores": []}
fIn = zip.open(filepath)
for line in io.TextIOWrapper(fIn, "utf8"):
sent1, sent2, score = line.strip().split("\t")
score = float(score)
sts_data[filename]["sentences1"].append(sent1)
sts_data[filename]["sentences2"].append(sent2)
sts_data[filename]["scores"].append(score)
for filename, data in sts_data.items():
test_evaluator = evaluation.EmbeddingSimilarityEvaluator(
data["sentences1"],
data["sentences2"],
data["scores"],
# Try to load this subset from STS17
test_dataset = None
try:
test_dataset = load_dataset("mteb/sts17-crosslingual-sts", subset, split="test")
except Exception:
try:
test_dataset = load_dataset("mteb/sts17-crosslingual-sts", f"{subset[3:]}-{subset[:2]}", split="test")
subset = f"{subset[3:]}-{subset[:2]}"
except Exception:
pass
if test_dataset:
test_evaluator = EmbeddingSimilarityEvaluator(
sentences1=test_dataset["sentence1"],
sentences2=test_dataset["sentence2"],
scores=[score / 5.0 for score in test_dataset["score"]], # Convert 0-5 scores to 0-1 scores
batch_size=inference_batch_size,
name=filename,
name=f"sts17-{subset}-test",
show_progress_bar=False,
)
evaluators.append(test_evaluator)
evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores))
# Now also prepare the evaluation datasets for training
eval_dataset_dict = eval_dataset_dict.map(prepare_dataset, batched=True, batch_size=30000, remove_columns=column_names)
# 5. Define the training arguments
args = SentenceTransformerTrainingArguments(
# Required parameter:
output_dir=output_dir,
# Optional training parameters:
num_train_epochs=num_train_epochs,
per_device_train_batch_size=train_batch_size,
per_device_eval_batch_size=train_batch_size,
warmup_ratio=0.1,
fp16=True, # Set to False if you get an error that your GPU can't run on FP16
bf16=False, # Set to True if you have a GPU that supports BF16
learning_rate=2e-5,
# Optional tracking/debugging parameters:
eval_strategy="steps",
eval_steps=num_evaluation_steps,
save_strategy="steps",
save_steps=num_evaluation_steps,
save_total_limit=2,
logging_steps=100,
run_name=f"multilingual-{'-'.join(source_languages)}-{'-'.join(target_languages)}", # Will be used in W&B if `wandb` is installed
)
# Train the model
student_model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)),
epochs=num_epochs,
warmup_steps=num_warmup_steps,
evaluation_steps=num_evaluation_steps,
output_path=output_path,
save_best_model=True,
optimizer_params={"lr": 2e-5, "eps": 1e-6},
# 6. Create the trainer & start training
trainer = SentenceTransformerTrainer(
model=student_model,
args=args,
train_dataset=train_dataset_dict,
eval_dataset=eval_dataset_dict,
loss=train_loss,
evaluator=evaluator,
)
trainer.train()
# 7. Save the trained & evaluated model locally
final_output_dir = f"{output_dir}/final"
student_model.save(final_output_dir)
# 8. (Optional) save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = student_model_name if "/" not in student_model_name else student_model_name.split("/")[-1]
try:
student_model.push_to_hub(f"{model_name}-multilingual-{'-'.join(source_languages)}-{'-'.join(target_languages)}")
except Exception:
logging.error(
f"Error uploading model to the Hugging Face Hub:\n{traceback.format_exc()}To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({final_output_dir!r})` "
f"and saving it using `model.push_to_hub('{model_name}-multilingual-{'-'.join(source_languages)}-{'-'.join(target_languages)}')`."
)
# Natural Language Inference
Given two sentence (premise and hypothesis), Natural Language Inference (NLI) is the task of deciding if the premise entails the hypothesis, if they are contradiction or if they are neutral. Commonly used NLI dataset are [SNLI](https://arxiv.org/abs/1508.05326) and [MultiNLI](https://arxiv.org/abs/1704.05426).
Given two sentence (premise and hypothesis), Natural Language Inference (NLI) is the task of deciding if the premise entails the hypothesis, if they are contradiction, or if they are neutral. Commonly used NLI dataset are [SNLI](https://huggingface.co/datasets/stanfordnlp/snli) and [MultiNLI](https://huggingface.co/datasets/nyu-mll/multi_nli).
[Conneau et al.](https://arxiv.org/abs/1705.02364) showed that NLI data can be quite useful when training Sentence Embedding methods. We also found this in our [Sentence-BERT-Paper](https://arxiv.org/abs/1908.10084) and often use NLI as a first fine-tuning step for sentence embedding methods.
To train on NLI, see the following example files:
- **[training_nli.py](training_nli.py)** - This example uses the Softmax-Classification-Loss, as described in the [SBERT-Paper](https://arxiv.org/abs/1908.10084), to learn sentence embeddings.
- **[training_nli_v2.py](training_nli_v2.py)** - The Softmax-Classification-Loss, as used in our original SBERT paper, does not yield optimal performance. A better loss is [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss), where we provide pairs or triplets. In that example, we provide a triplet of the format: (anchor, entailment_sentence, contradiction_sentence). The NLI data provides such triplets. The MultipleNegativesRankingLoss yields much higher performances and is more intuitive than the Softmax-Classification-Loss. We have used this loss to train the paraphrase model in our [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813) paper.
- **[training_nli_v3.py](training_nli_v3.py)** - Following the [GISTEmbed](https://arxiv.org/abs/2402.16829) paper, we can modify the in-batch negative selection from [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss) using a guiding model. Candidate negative pairs are ignored during training if the guiding model considers the pair to be too similar. In practice, the [GISTEmbedLoss](https://www.sbert.net/docs/package_reference/losses.html#gistembedloss) tends to produce a stronger training signal than `MultipleNegativesRankingLoss` at the cost of some training overhead for running inference on the guiding model.
1. **[training_nli.py](training_nli.py)**:
```eval_rst
This example uses :class:`~sentence_transformers.losses.SoftmaxLoss` as described in the original [Sentence Transformers paper](https://arxiv.org/abs/1908.10084).
```
2. **[training_nli_v2.py](training_nli_v2.py)**:
```eval_rst
The :class:`~sentence_transformers.losses.SoftmaxLoss` as used in our original SBERT paper does not yield optimal performance. A better loss is :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss`, where we provide pairs or triplets. In this script, we provide a triplet of the format: (anchor, entailment_sentence, contradiction_sentence). The NLI data provides such triplets. The :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` yields much higher performances and is more intuitive than :class:`~sentence_transformers.losses.SoftmaxLoss`. We have used this loss to train the paraphrase model in our `Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation <https://arxiv.org/abs/2004.09813>`_ paper.
```
3. **[training_nli_v3.py](training_nli_v3.py)**
```eval_rst
Following the `GISTEmbed <https://arxiv.org/abs/2402.16829>`_ paper, we can modify the in-batch negative selection from :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` using a guiding model. Candidate negative pairs are ignored during training if the guiding model considers the pair to be too similar. In practice, the :class:`~sentence_transformers.losses.GISTEmbedLoss` tends to produce a stronger training signal than :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` at the cost of some training overhead for running inference on the guiding model.
```
## Data
In our experiments we combine [SNLI](https://arxiv.org/abs/1508.05326) and [MultiNLI](https://arxiv.org/abs/1704.05426), which we call AllNLI. These two datasets contain sentence pairs and one of three labels: entailment, neutral, contradiction:
We combine [SNLI](https://huggingface.co/datasets/stanfordnlp/snli) and [MultiNLI](https://huggingface.co/datasets/nyu-mll/multi_nli) into a dataset we call [AllNLI](https://huggingface.co/datasets/sentence-transformers/all-nli). These two datasets contain sentence pairs and one of three labels: entailment, neutral, contradiction:
| Sentence A (Premise) | Sentence B (Hypothesis) | Label |
| --- | --- | --- |
......@@ -18,45 +27,45 @@ In our experiments we combine [SNLI](https://arxiv.org/abs/1508.05326) and [Mult
| An older and younger man smiling. | Two men are smiling and laughing at the cats playing on the floor. | neutral |
| A man inspects the uniform of a figure in some East Asian country. | The man is sleeping. | contradiction |
We format AllNLI in a few different subsets, compatible with different loss functions. See for example the [triplet subset of AllNLI](https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/triplet).
## SoftmaxLoss
[Conneau et al.](https://arxiv.org/abs/1705.02364) described how a softmax classifier on top of a siamese network can be used to learn meaningful sentence representation. We can achieve this by using the [losses.SoftmaxLoss](../../../docs/package_reference/losses.html#softmaxloss) package.
```eval_rst
`Conneau et al. <https://arxiv.org/abs/1705.02364>`_ described how a softmax classifier on top of a `siamese network <https://en.wikipedia.org/wiki/Siamese_neural_network>`_ can be used to learn meaningful sentence representation. We can achieve this by using :class:`~sentence_transformers.losses.SoftmaxLoss`:
```
<img src="https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/SBERT_SoftmaxLoss.png" alt="SBERT SoftmaxLoss" width="250"/>
The softmax loss looks like this:
We pass the two sentences through our SentenceTransformer model and get the sentence embeddings *u* and *v*. We then concatenate *u*, *v* and *|u-v|* to form one long vector. This vector is then passed to a softmax classifier, which predicts our three classes (entailment, neutral, contradiction).
![SBERT SoftmaxLoss](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/SBERT_SoftmaxLoss.png "SBERT SoftmaxLoss")
We pass the two sentences through our SentenceTransformer network and get the sentence embeddings *u* and *v*. We then concatenate u, v and |u-v| to form one, long vector. This vector is then passed to a softmax classifier, which predicts our three classes (entailment, neutral, contradiction).
This setup learns sentence embeddings, that can later be used for wide variety of tasks.
This setup learns sentence embeddings that can later be used for wide variety of tasks.
## MultipleNegativesRankingLoss
```eval_rst
That the :class:`~sentence_transformers.losses.SoftmaxLoss` with NLI data produces (relatively) good sentence embeddings is rather coincidental. The :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` is much more intuitive and produces significantly better sentence representations.
```
That the softmax-loss with NLI data produces (relatively) good sentence embeddings is rather coincidental. The [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss) is much more intuitive and produces also significantly better sentence representations.
The training data for MultipleNegativesRankingLoss consists of sentence pairs [(a<sub>1</sub>, b<sub>1</sub>), ..., (a<sub>n</sub>, b<sub>n</sub>)] where we assume that (a<sub>i</sub>, b<sub>i</sub>) are similar sentences and (a<sub>i</sub>, b<sub>j</sub>) are dissimilar sentences for i != j. The minimizes the distance between (a<sub>i</sub>, b<sub>i</sub>) while it simultaneously maximizes the distance (a<sub>i</sub>, b<sub>j</sub>) for all i != j.
The training data for MultipleNegativesRankingLoss consists of sentence pairs [(a<sub>1</sub>, b<sub>1</sub>), ..., (a<sub>n</sub>, b<sub>n</sub>)] where we assume that (a<sub>i</sub>, b<sub>i</sub>) are similar sentences and (a<sub>i</sub>, b<sub>j</sub>) are dissimilar sentences for i != j. The minimizes the distance between (a<sub>i</sub>, b<sub>i</sub>) while it simultaneously maximizes the distance (a<sub>i</sub>, b<sub>j</sub>) for all i != j. For example, in the following picture:
For example in the following picture:
![](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/MultipleNegativeRankingLoss.png)
<img src="https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/MultipleNegativeRankingLoss.png" alt="SBERT MultipleNegativeRankingLoss" width="350"/>
The distance between (a<sub>1</sub>, b<sub>1</sub>) is reduced, while the distance between (a<sub>1</sub>, b<sub>2...5</sub>) will be increased. The same is done for a<sub>2</sub>, ..., a<sub>5</sub>.
Using MultipleNegativeRankingLoss with NLI is rather easy: We define sentences that have an *entailment* label as positive pairs. E.g, we have pairs like (*"A soccer game with multiple males playing."*, *"Some men are playing a sport."*) and want that these pairs are close in vector space.
```eval_rst
Using :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` with NLI is rather easy: We define sentences that have an *entailment* label as positive pairs. E.g, we have pairs like (*"A soccer game with multiple males playing."*, *"Some men are playing a sport."*) and want that these pairs are close in vector space. The `pair subset of AllNLI <https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/pair>`_ has been prepared in this format.
```
### MultipleNegativesRankingLoss with Hard Negatives
We can further improve MultipleNegativesRankingLoss by not only providing pairs, but by providing triplets: [(a<sub>1</sub>, b<sub>1</sub>, c<sub>1</sub>), ..., (a<sub>n</sub>, b<sub>n</sub>, c<sub>n</sub>)]
The entry for c<sub>i</sub> are so-called hard-negatives: On a lexical level, they are similar to a<sub>i</sub> and b<sub>i</sub>. But on a semantic level, they mean different things and should not be close in the vector space.
We can further improve MultipleNegativesRankingLoss by providing triplets rather than pairs: [(a<sub>1</sub>, b<sub>1</sub>, c<sub>1</sub>), ..., (a<sub>n</sub>, b<sub>n</sub>, c<sub>n</sub>)]. The samples for c<sub>i</sub> are so-called hard-negatives: On a lexical level, they are similar to a<sub>i</sub> and b<sub>i</sub>, but on a semantic level, they mean different things and should not be close to a<sub>i</sub> in the vector space.
For NLI data, we can use the contradiction-label to create such triplets with a hard negative. So our triplets look like this:
("*A soccer game with multiple males playing."*, *"Some men are playing a sport."*, *"A group of men playing a baseball game."*).
("*A soccer game with multiple males playing."*, *"Some men are playing a sport."*, *"A group of men playing a baseball game."*). We want the sentences *"A soccer game with multiple males playing."* and *"Some men are playing a sport."* to be close in the vector space, while there should be a larger distance between *"A soccer game with multiple males playing."* and "*A group of men playing a baseball game."*. The [triplet subset of AllNLI](https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/triplet) has been prepared in this format.
### GISTEmbedLoss
```eval_rst
:class:`~sentence_transformers.losses.MultipleNegativesRankingLoss` can be extended even further by recognizing that the in-batch negative sampling as shown in `this example <#multiplenegativesrankingloss>`_ is a bit flawed. In particular, we automatically assume that the pairs (a\ :sub:`1`\ , b\ :sub:`2`\ ), ..., (a\ :sub:`1`\ , b\ :sub:`n`\ ) are negative, but that does not strictly have to be true.
We want the sentences *"A soccer game with multiple males playing."* and *"Some men are playing a sport."* to be close in the vector space, while there should be a larger distance between *"A soccer game with multiple males playing."* and "*A group of men playing a baseball game."*.
To address this, :class:`~sentence_transformers.losses.GISTEmbedLoss` uses a Sentence Transformer model to guide the in-batch negative sample selection. In particular, if the guide model considers the similarity of (a\ :sub:`1`\ , b\ :sub:`n`\ ) to be larger than (a\ :sub:`1`\ , b\ :sub:`1`\ ), then the (a\ :sub:`1`\ , b\ :sub:`n`\ ) pair is considered a false negative and consequently ignored in the training process. In essence, this results in higher quality training data for the model.
```
\ No newline at end of file
......@@ -10,128 +10,113 @@ OR
python training_nli.py pretrained_transformer_model_name
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"
sts_dataset_path = "data/stsbenchmark.tsv.gz"
if not os.path.exists(nli_dataset_path):
util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
import traceback
from datetime import datetime
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
# Set the log level to INFO to get more information
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
# You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
# You can specify any Hugging Face pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else "bert-base-uncased"
# Read the dataset
train_batch_size = 16
output_dir = "output/training_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = (
"output/training_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# 1. Here we define our SentenceTransformer model. If not already a Sentence Transformer model, it will automatically
# create one with "mean" pooling.
model = SentenceTransformer(model_name)
# Read the AllNLI.tsv.gz file and create the training dataset
# 2. Load the AllNLI dataset: https://huggingface.co/datasets/sentence-transformers/all-nli
# We'll start with 10k training samples, but you can increase this to get a stronger model
logging.info("Read AllNLI train dataset")
train_dataset = load_dataset("sentence-transformers/all-nli", "pair-class", split="train").select(range(10000))
eval_dataset = load_dataset("sentence-transformers/all-nli", "pair-class", split="dev").select(range(1000))
logging.info(train_dataset)
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_samples = []
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "train":
label_id = label2int[row["label"]]
train_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=label_id))
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
# 3. Define our training loss: https://sbert.net/docs/package_reference/sentence_transformer/losses.html#softmaxloss
train_loss = losses.SoftmaxLoss(
model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int)
model=model,
sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
num_labels=3,
)
# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "dev":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
dev_samples, batch_size=train_batch_size, name="sts-dev"
# 4. Define an evaluator for use during training. This is useful to keep track of alongside the evaluation loss.
stsb_eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
sentences1=stsb_eval_dataset["sentence1"],
sentences2=stsb_eval_dataset["sentence2"],
scores=stsb_eval_dataset["score"],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
logging.info("Evaluation before training:")
dev_evaluator(model)
# 5. Define the training arguments
args = SentenceTransformerTrainingArguments(
# Required parameter:
output_dir=output_dir,
# Optional training parameters:
num_train_epochs=1,
per_device_train_batch_size=train_batch_size,
per_device_eval_batch_size=train_batch_size,
warmup_ratio=0.1,
fp16=True, # Set to False if you get an error that your GPU can't run on FP16
bf16=False, # Set to True if you have a GPU that supports BF16
# Optional tracking/debugging parameters:
eval_strategy="steps",
eval_steps=100,
save_strategy="steps",
save_steps=100,
save_total_limit=2,
logging_steps=100,
run_name="nli-v1", # Will be used in W&B if `wandb` is installed
)
# Configure the training
num_epochs = 1
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
# 6. Create the trainer & start training
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
loss=train_loss,
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "test":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
test_samples, batch_size=train_batch_size, name="sts-test"
trainer.train()
# 7. Evaluate the model performance on the STS Benchmark test dataset
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
sentences1=test_dataset["sentence1"],
sentences2=test_dataset["sentence2"],
scores=test_dataset["score"],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
test_evaluator(model, output_path=model_save_path)
test_evaluator(model)
# 8. Save the trained & evaluated model locally
final_output_dir = f"{output_dir}/final"
model.save(final_output_dir)
# 9. (Optional) save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
model.push_to_hub(f"{model_name}-nli-v1")
except Exception:
logging.error(
f"Error uploading model to the Hugging Face Hub:\n{traceback.format_exc()}To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({final_output_dir!r})` "
f"and saving it using `model.push_to_hub('{model_name}-nli-v1')`."
)
......@@ -10,23 +10,20 @@ OR
python training_nli_v2.py pretrained_transformer_model_name
"""
import math
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
import traceback
from datetime import datetime
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import BatchSamplers, SentenceTransformerTrainingArguments
# Set the log level to INFO to get more information
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilroberta-base"
train_batch_size = 128 # The larger you select this, the better the results (usually). But it requires more GPU memory
......@@ -34,121 +31,94 @@ max_seq_length = 75
num_epochs = 1
# Save path of the model
model_save_path = (
output_dir = (
"output/training_nli_v2_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"
sts_dataset_path = "data/stsbenchmark.tsv.gz"
if not os.path.exists(nli_dataset_path):
util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
# 1. Here we define our SentenceTransformer model. If not already a Sentence Transformer model, it will automatically
# create one with "mean" pooling.
model = SentenceTransformer(model_name)
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# Read the AllNLI.tsv.gz file and create the training dataset
# 2. Load the AllNLI dataset: https://huggingface.co/datasets/sentence-transformers/all-nli
# We'll start with 10k training samples, but you can increase this to get a stronger model
logging.info("Read AllNLI train dataset")
train_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="train").select(range(10000))
eval_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="dev").select(range(1000))
logging.info(train_dataset)
def add_to_samples(sent1, sent2, label):
if sent1 not in train_data:
train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
train_data[sent1][label].add(sent2)
train_data = {}
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "train":
sent1 = row["sentence1"].strip()
sent2 = row["sentence2"].strip()
add_to_samples(sent1, sent2, row["label"])
add_to_samples(sent2, sent1, row["label"]) # Also add the opposite
train_samples = []
for sent1, others in train_data.items():
if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
train_samples.append(
InputExample(
texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
)
)
train_samples.append(
InputExample(
texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
)
)
logging.info("Train samples: {}".format(len(train_samples)))
# Special data loader that avoid duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# Our training loss
# 3. Define our training loss: https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss
train_loss = losses.MultipleNegativesRankingLoss(model)
# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "dev":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
dev_samples, batch_size=train_batch_size, name="sts-dev"
# 4. Define an evaluator for use during training. This is useful to keep track of alongside the evaluation loss.
stsb_eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
sentences1=stsb_eval_dataset["sentence1"],
sentences2=stsb_eval_dataset["sentence2"],
scores=stsb_eval_dataset["score"],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
logging.info("Evaluation before training:")
dev_evaluator(model)
# 5. Define the training arguments
args = SentenceTransformerTrainingArguments(
# Required parameter:
output_dir=output_dir,
# Optional training parameters:
num_train_epochs=1,
per_device_train_batch_size=train_batch_size,
per_device_eval_batch_size=train_batch_size,
warmup_ratio=0.1,
fp16=True, # Set to False if you get an error that your GPU can't run on FP16
bf16=False, # Set to True if you have a GPU that supports BF16
batch_sampler=BatchSamplers.NO_DUPLICATES,
# Optional tracking/debugging parameters:
eval_strategy="steps",
eval_steps=10,
save_strategy="steps",
save_steps=10,
save_total_limit=2,
logging_steps=100,
run_name="nli-v2", # Will be used in W&B if `wandb` is installed
)
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
# 6. Create the trainer & start training
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
loss=train_loss,
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=int(len(train_dataloader) * 0.1),
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=False, # Set to True, if your GPU supports FP16 operations
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "test":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
test_samples, batch_size=train_batch_size, name="sts-test"
trainer.train()
# 7. Evaluate the model performance on the STS Benchmark test dataset
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
sentences1=test_dataset["sentence1"],
sentences2=test_dataset["sentence2"],
scores=test_dataset["score"],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
test_evaluator(model, output_path=model_save_path)
test_evaluator(model)
# 8. Save the trained & evaluated model locally
final_output_dir = f"{output_dir}/final"
model.save(final_output_dir)
# 9. (Optional) save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
model.push_to_hub(f"{model_name}-nli-v2")
except Exception:
logging.error(
f"Error uploading model to the Hugging Face Hub:\n{traceback.format_exc()}To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({final_output_dir!r})` "
f"and saving it using `model.push_to_hub('{model_name}-nli-v2')`."
)
......@@ -10,23 +10,20 @@ OR
python training_nli_v3.py pretrained_transformer_model_name
"""
import math
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
import traceback
from datetime import datetime
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import BatchSamplers, SentenceTransformerTrainingArguments
# Set the log level to INFO to get more information
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilroberta-base"
train_batch_size = 128 # The larger you select this, the better the results (usually). But it requires more GPU memory
......@@ -34,124 +31,95 @@ max_seq_length = 75
num_epochs = 1
# Save path of the model
model_save_path = (
output_dir = (
"output/training_nli_v3_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"
sts_dataset_path = "data/stsbenchmark.tsv.gz"
if not os.path.exists(nli_dataset_path):
util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# 1. Here we define our SentenceTransformer model. If not already a Sentence Transformer model, it will automatically
# create one with "mean" pooling.
model = SentenceTransformer(model_name)
# Read the AllNLI.tsv.gz file and create the training dataset
# 2. Load the AllNLI dataset: https://huggingface.co/datasets/sentence-transformers/all-nli
# We'll start with 10k training samples, but you can increase this to get a stronger model
logging.info("Read AllNLI train dataset")
train_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="train").select(range(10000))
eval_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="dev").select(range(1000))
logging.info(train_dataset)
def add_to_samples(sent1, sent2, label):
if sent1 not in train_data:
train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
train_data[sent1][label].add(sent2)
train_data = {}
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "train":
sent1 = row["sentence1"].strip()
sent2 = row["sentence2"].strip()
add_to_samples(sent1, sent2, row["label"])
add_to_samples(sent2, sent1, row["label"]) # Also add the opposite
train_samples = []
for sent1, others in train_data.items():
if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
train_samples.append(
InputExample(
texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
)
)
train_samples.append(
InputExample(
texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
)
)
logging.info("Train samples: {}".format(len(train_samples)))
# Special data loader that avoid duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# 3. Define our training loss: https://sbert.net/docs/package_reference/sentence_transformer/losses.html#gistembedloss
# The guiding model
guide_model = SentenceTransformer("all-MiniLM-L6-v2")
# Our training loss
train_loss = losses.GISTEmbedLoss(model, guide_model)
# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "dev":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
dev_samples, batch_size=train_batch_size, name="sts-dev"
# 4. Define an evaluator for use during training. This is useful to keep track of alongside the evaluation loss.
stsb_eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
sentences1=stsb_eval_dataset["sentence1"],
sentences2=stsb_eval_dataset["sentence2"],
scores=stsb_eval_dataset["score"],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
logging.info("Evaluation before training:")
dev_evaluator(model)
# 5. Define the training arguments
args = SentenceTransformerTrainingArguments(
# Required parameter:
output_dir=output_dir,
# Optional training parameters:
num_train_epochs=1,
per_device_train_batch_size=train_batch_size,
per_device_eval_batch_size=train_batch_size,
warmup_ratio=0.1,
fp16=True, # Set to False if you get an error that your GPU can't run on FP16
bf16=False, # Set to True if you have a GPU that supports BF16
batch_sampler=BatchSamplers.NO_DUPLICATES,
# Optional tracking/debugging parameters:
eval_strategy="steps",
eval_steps=10,
save_strategy="steps",
save_steps=10,
save_total_limit=2,
logging_steps=100,
run_name="nli-v3", # Will be used in W&B if `wandb` is installed
)
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
# 6. Create the trainer & start training
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
loss=train_loss,
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=int(len(train_dataloader) * 0.1),
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=False, # Set to True, if your GPU supports FP16 operations
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "test":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
test_samples, batch_size=train_batch_size, name="sts-test"
trainer.train()
# 7. Evaluate the model performance on the STS Benchmark test dataset
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
sentences1=test_dataset["sentence1"],
sentences2=test_dataset["sentence2"],
scores=test_dataset["score"],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
test_evaluator(model, output_path=model_save_path)
test_evaluator(model)
# 8. Save the trained & evaluated model locally
final_output_dir = f"{output_dir}/final"
model.save(final_output_dir)
# 9. (Optional) save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
model.push_to_hub(f"{model_name}-nli-v3")
except Exception:
logging.error(
f"Error uploading model to the Hugging Face Hub:\n{traceback.format_exc()}To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({final_output_dir!r})` "
f"and saving it using `model.push_to_hub('{model_name}-nli-v3')`."
)
......@@ -16,18 +16,18 @@ which sentence with another label is the closest (hard negative example). It the
all sentences with the same label should be close and sentences for different labels should be clearly separated.
"""
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util
from sentence_transformers.datasets import SentenceLabelDataset
from torch.utils.data import DataLoader
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
import logging
import os
import random
from collections import defaultdict
from datetime import datetime
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, SentenceTransformer, losses, util
from sentence_transformers.datasets import SentenceLabelDataset
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.readers import InputExample
logging.basicConfig(
format="%(asctime)s - %(message)s",
......
......@@ -4,126 +4,130 @@ This is an example how to train SentenceTransformers in a multi-task setup.
The system trains BERT on the AllNLI and on the STSbenchmark dataset.
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
import traceback
from datetime import datetime
import gzip
import csv
import os
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.losses import CosineSimilarityLoss, SoftmaxLoss
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import MultiDatasetBatchSamplers, SentenceTransformerTrainingArguments
# Set the log level to INFO to get more information
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
# Read the dataset
model_name = "bert-base-uncased"
num_train_epochs = 1
batch_size = 16
model_save_path = "output/training_multi-task_" + model_name + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "datasets/AllNLI.tsv.gz"
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(nli_dataset_path):
util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# Use BERT for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
output_dir = "output/training_multi-task_" + model_name + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# 1. Here we define our SentenceTransformer model. If not already a Sentence Transformer model, it will automatically
# create one with "mean" pooling.
model = SentenceTransformer(model_name)
# If we want, we can limit the maximum sequence length for the model
# model.max_seq_length = 75
logging.info(model)
# 2a. Load the AllNLI dataset: https://huggingface.co/datasets/sentence-transformers/all-nli
nli_train_dataset = load_dataset("sentence-transformers/all-nli", "pair-class", split="train")
nli_eval_dataset = load_dataset("sentence-transformers/all-nli", "pair-class", split="dev").select(range(1000))
logging.info(nli_train_dataset)
# 2b. Load the STSB dataset: https://huggingface.co/datasets/sentence-transformers/stsb
stsb_train_dataset = load_dataset("sentence-transformers/stsb", split="train")
stsb_eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")
stsb_test_dataset = load_dataset("sentence-transformers/stsb", split="test")
logging.info(stsb_train_dataset)
# 3. Define our training losses
# 3a. SoftmaxLoss for the NLI data (sentence_A, sentence_B, class), see also https://sbert.net/docs/training/loss_overview.html
train_loss_nli = SoftmaxLoss(
model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=3
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_nli_samples = []
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "train":
label_id = label2int[row["label"]]
train_nli_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=label_id))
train_dataloader_nli = DataLoader(train_nli_samples, shuffle=True, batch_size=batch_size)
train_loss_nli = losses.SoftmaxLoss(
model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int)
# 3b. CosineSimilarityLoss for the STSB data (sentence_A, sentence_B, similarity score between 0 and 1)
train_loss_sts = CosineSimilarityLoss(model=model)
# 4. Define an evaluator for use during training. This is useful to keep track of alongside the evaluation loss.
dev_evaluator = EmbeddingSimilarityEvaluator(
sentences1=stsb_eval_dataset["sentence1"],
sentences2=stsb_eval_dataset["sentence2"],
scores=stsb_eval_dataset["score"],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
logging.info("Read STSbenchmark train dataset")
train_sts_samples = []
dev_sts_samples = []
test_sts_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_sts_samples.append(inp_example)
elif row["split"] == "test":
test_sts_samples.append(inp_example)
else:
train_sts_samples.append(inp_example)
train_dataloader_sts = DataLoader(train_sts_samples, shuffle=True, batch_size=batch_size)
train_loss_sts = losses.CosineSimilarityLoss(model=model)
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_sts_samples, name="sts-dev")
# Configure the training
num_epochs = 4
warmup_steps = math.ceil(len(train_dataloader_sts) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Here we define the two train objectives: train_dataloader_nli with train_loss_nli (i.e., SoftmaxLoss for NLI data)
# and train_dataloader_sts with train_loss_sts (i.e., CosineSimilarityLoss for STSbenchmark data)
# You can pass as many (dataloader, loss) tuples as you like. They are iterated in a round-robin way.
train_objectives = [(train_dataloader_nli, train_loss_nli), (train_dataloader_sts, train_loss_sts)]
# Train the model
model.fit(
train_objectives=train_objectives,
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=model_save_path,
# 5. Define the training arguments
args = SentenceTransformerTrainingArguments(
# Required parameter:
output_dir=output_dir,
# Optional training parameters:
num_train_epochs=num_train_epochs,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
warmup_ratio=0.1,
fp16=True, # Set to False if you get an error that your GPU can't run on FP16
bf16=False, # Set to True if you have a GPU that supports BF16
# With ROUND_ROBIN you'll sample the same amount from each dataset, until one of the multi-datasets is exhausted
# The alternative is PROPORTIONAL, which samples from each dataset in proportion to the dataset size,
# but that will lead to a lot of samples from the larger dataset (AllNLI in this case)
multi_dataset_batch_sampler=MultiDatasetBatchSamplers.ROUND_ROBIN,
# Optional tracking/debugging parameters:
eval_strategy="steps",
eval_steps=100,
save_strategy="steps",
save_steps=100,
save_total_limit=2,
logging_steps=100,
run_name="multi-task", # Will be used in W&B if `wandb` is installed
)
# 6. Create the trainer & start training
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset={
"all-nli": nli_train_dataset,
"sts": stsb_train_dataset,
},
eval_dataset={
"all-nli": nli_eval_dataset,
"sts": stsb_eval_dataset,
},
loss={
"all-nli": train_loss_nli,
"sts": train_loss_sts,
},
evaluator=dev_evaluator,
)
trainer.train()
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_sts_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
# 7. Evaluate the model performance on the STS Benchmark test dataset
test_evaluator = EmbeddingSimilarityEvaluator(
sentences1=stsb_test_dataset["sentence1"],
sentences2=stsb_test_dataset["sentence2"],
scores=stsb_test_dataset["score"],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
test_evaluator(model)
# 8. Save the trained & evaluated model locally
final_output_dir = f"{output_dir}/final"
model.save(final_output_dir)
# 9. (Optional) save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
model.push_to_hub(f"{model_name}-multi-task")
except Exception:
logging.error(
f"Error uploading model to the Hugging Face Hub:\n{traceback.format_exc()}To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({final_output_dir!r})` "
f"and saving it using `model.push_to_hub('{model_name}-multi-task')`."
)
......@@ -4,106 +4,110 @@ This script trains sentence transformers with a triplet loss function.
As corpus, we use the wikipedia sections dataset that was describd by Dor et al., 2018, Learning Thematic Similarity Metric Using Triplet Networks.
"""
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
from zipfile import ZipFile
import csv
import logging
import os
import traceback
from datetime import datetime
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.losses import TripletLoss
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)
# Set the log level to INFO to get more information
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
# You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = "distilbert-base-uncased"
batch_size = 16
num_train_epochs = 1
dataset_path = "datasets/wikipedia-sections"
if not os.path.exists(dataset_path):
os.makedirs(dataset_path, exist_ok=True)
filepath = os.path.join(dataset_path, "wikipedia-sections-triplets.zip")
util.http_get("https://sbert.net/datasets/wikipedia-sections-triplets.zip", filepath)
with ZipFile(filepath, "r") as zip:
zip.extractall(dataset_path)
output_dir = "output/training-wikipedia-sections-" + model_name + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
### Create a torch.DataLoader that passes training batch instances to our model
train_batch_size = 16
output_path = "output/training-wikipedia-sections-" + model_name + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
num_epochs = 1
# 1. Here we define our SentenceTransformer model. If not already a Sentence Transformer model, it will automatically
# create one with "mean" pooling.
model = SentenceTransformer(model_name)
# If we want, we can limit the maximum sequence length for the model
# model.max_seq_length = 75
logging.info(model)
### Configure sentence transformers for training and train on the provided dataset
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
# 2. Load the Wikipedia-Sections dataset: https://huggingface.co/datasets/sentence-transformers/wikipedia-sections
train_dataset = load_dataset("sentence-transformers/wikipedia-sections", "triplet", split="train").select(
range(10_000)
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
logger.info("Read Triplet train dataset")
train_examples = []
with open(os.path.join(dataset_path, "train.csv"), encoding="utf-8") as fIn:
reader = csv.DictReader(fIn, delimiter=",", quoting=csv.QUOTE_MINIMAL)
for row in reader:
train_examples.append(InputExample(texts=[row["Sentence1"], row["Sentence2"], row["Sentence3"]], label=0))
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)
logger.info("Read Wikipedia Triplet dev dataset")
dev_examples = []
with open(os.path.join(dataset_path, "validation.csv"), encoding="utf-8") as fIn:
reader = csv.DictReader(fIn, delimiter=",", quoting=csv.QUOTE_MINIMAL)
for row in reader:
dev_examples.append(InputExample(texts=[row["Sentence1"], row["Sentence2"], row["Sentence3"]]))
if len(dev_examples) >= 1000:
break
evaluator = TripletEvaluator.from_input_examples(dev_examples, name="dev")
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) # 10% of train data
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=output_path,
eval_dataset = load_dataset("sentence-transformers/wikipedia-sections", "triplet", split="validation").select(
range(1000)
)
test_dataset = load_dataset("sentence-transformers/wikipedia-sections", "triplet", split="test").select(range(1000))
logging.info(train_dataset)
# 3. Define our training loss
# TripletLoss (https://sbert.net/docs/package_reference/sentence_transformer/losses.html#tripletloss) needs three text columns
train_loss = TripletLoss(model)
# 4. Define an evaluator for use during training. This is useful to keep track of alongside the evaluation loss.
dev_evaluator = TripletEvaluator(
anchors=eval_dataset[:1000]["anchor"],
positives=eval_dataset[:1000]["positive"],
negatives=eval_dataset[:1000]["negative"],
name="wikipedia-sections-dev",
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
# 5. Define the training arguments
args = SentenceTransformerTrainingArguments(
# Required parameter:
output_dir=output_dir,
# Optional training parameters:
num_train_epochs=num_train_epochs,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
warmup_ratio=0.1,
fp16=True, # Set to False if you get an error that your GPU can't run on FP16
bf16=False, # Set to True if you have a GPU that supports BF16
# Optional tracking/debugging parameters:
eval_strategy="steps",
eval_steps=100,
save_strategy="steps",
save_steps=100,
save_total_limit=2,
logging_steps=100,
run_name="wikipedia-sections-triplet", # Will be used in W&B if `wandb` is installed
)
logger.info("Read test examples")
test_examples = []
with open(os.path.join(dataset_path, "test.csv"), encoding="utf-8") as fIn:
reader = csv.DictReader(fIn, delimiter=",", quoting=csv.QUOTE_MINIMAL)
for row in reader:
test_examples.append(InputExample(texts=[row["Sentence1"], row["Sentence2"], row["Sentence3"]]))
# 6. Create the trainer & start training
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
loss=train_loss,
evaluator=dev_evaluator,
)
trainer.train()
model = SentenceTransformer(output_path)
test_evaluator = TripletEvaluator.from_input_examples(test_examples, name="test")
test_evaluator(model, output_path=output_path)
# 7. Evaluate the model performance on the STS Benchmark test dataset
test_evaluator = TripletEvaluator(
anchors=test_dataset["anchor"],
positives=test_dataset["positive"],
negatives=test_dataset["negative"],
name="wikipedia-sections-test",
)
test_evaluator(model)
# 8. Save the trained & evaluated model locally
final_output_dir = f"{output_dir}/final"
model.save(final_output_dir)
# 9. (Optional) save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
model.push_to_hub(f"{model_name}-wikipedia-sections-triplet")
except Exception:
logging.error(
f"Error uploading model to the Hugging Face Hub:\n{traceback.format_exc()}To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({final_output_dir!r})` "
f"and saving it using `model.push_to_hub('{model_name}-wikipedia-sections-triplet')`."
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment