Commit 0fccd232 authored by Rayyyyy

First add
"""
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with MatryoshkaLoss using MultipleNegativesRankingLoss. This trains a model at output dimensions [768, 512, 256, 128, 64].
Entailments are used as positive pairs, and a contradiction from the AllNLI dataset is added as a hard negative.
At every 10% of the training steps, the model is evaluated on the STS benchmark dataset.
Usage:
python matryoshka_nli.py
OR
python matryoshka_nli.py pretrained_transformer_model_name
"""
import math
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilroberta-base"
train_batch_size = 128  # The larger the batch size, the better the results (usually), but it requires more GPU memory
max_seq_length = 75
num_epochs = 1
# Save path of the model
model_save_path = (
"output/matryoshka_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"
if not os.path.exists(nli_dataset_path):
    util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
# Read the AllNLI.tsv.gz file and create the training dataset
logging.info("Read AllNLI train dataset")
def add_to_samples(sent1, sent2, label):
    if sent1 not in train_data:
        train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
    train_data[sent1][label].add(sent2)
train_data = {}
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        if row["split"] == "train":
            sent1 = row["sentence1"].strip()
            sent2 = row["sentence2"].strip()
            add_to_samples(sent1, sent2, row["label"])
            add_to_samples(sent2, sent1, row["label"])  # Also add the opposite
train_samples = []
for sent1, others in train_data.items():
    if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
        train_samples.append(
            InputExample(
                texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
            )
        )
        train_samples.append(
            InputExample(
                texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
            )
        )
logging.info("Train samples: {}".format(len(train_samples)))
# Special data loader that avoids duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=int(len(train_dataloader) * 0.1),
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=False, # Set to True, if your GPU supports FP16 operations
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
test_evaluator(model, output_path=model_save_path)
# Optionally, save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
    model.push_to_hub(f"{model_name}-nli-matryoshka")
except Exception:
    logging.error(
        "Error uploading model to the Hugging Face Hub. To upload it manually, you can run "
        f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({model_save_path!r})` "
        f"and saving it using `model.push_to_hub('{model_name}-nli-matryoshka')`."
    )
"""
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with MatryoshkaLoss using MultipleNegativesRankingLoss. This trains a model at output dimensions [768, 512, 256, 128, 64].
Entailments are used as positive pairs, and a contradiction from the AllNLI dataset is added as a hard negative.
At every 10% of the training steps, the model is evaluated on the STS benchmark dataset.
The difference between this script and matryoshka_nli.py is that this script uses a reduced dimensionality of the base
model by adding a Dense layer with `reduced_dim=256` output dimensions. This might be useful when your desired output
dimensionality is lower than the base model's default output dimensionality.
Usage:
python matryoshka_nli_reduced_dim.py
OR
python matryoshka_nli_reduced_dim.py pretrained_transformer_model_name
"""
import math
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilroberta-base"
train_batch_size = 128  # The larger the batch size, the better the results (usually), but it requires more GPU memory
max_seq_length = 75
num_epochs = 1
reduced_dim = 256
# Save path of the model
model_save_path = (
"output/matryoshka_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
dense = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=reduced_dim)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense])
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"
if not os.path.exists(nli_dataset_path):
    util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
# Read the AllNLI.tsv.gz file and create the training dataset
logging.info("Read AllNLI train dataset")
def add_to_samples(sent1, sent2, label):
    if sent1 not in train_data:
        train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
    train_data[sent1][label].add(sent2)
train_data = {}
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        if row["split"] == "train":
            sent1 = row["sentence1"].strip()
            sent2 = row["sentence2"].strip()
            add_to_samples(sent1, sent2, row["label"])
            add_to_samples(sent2, sent1, row["label"])  # Also add the opposite
train_samples = []
for sent1, others in train_data.items():
    if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
        train_samples.append(
            InputExample(
                texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
            )
        )
        train_samples.append(
            InputExample(
                texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
            )
        )
logging.info("Train samples: {}".format(len(train_samples)))
# Special data loader that avoids duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [256, 128, 64, 32, 16])
stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=int(len(train_dataloader) * 0.1),
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=False, # Set to True, if your GPU supports FP16 operations
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
test_evaluator(model, output_path=model_save_path)
# Optionally, save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
    model.push_to_hub(f"{model_name}-nli-matryoshka-{reduced_dim}")
except Exception:
    logging.error(
        "Error uploading model to the Hugging Face Hub. To upload it manually, you can run "
        f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({model_save_path!r})` "
        f"and saving it using `model.push_to_hub('{model_name}-nli-matryoshka-{reduced_dim}')`."
    )
"""
This example trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch.
It uses MatryoshkaLoss with the powerful CoSENTLoss to train models that perform well at output dimensions [768, 512, 256, 128, 64].
It generates sentence embeddings that can be compared using cosine-similarity to measure the similarity.
Usage:
python matryoshka_sts.py
OR
python matryoshka_sts.py pretrained_transformer_model_name
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
    util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilbert-base-uncased"
# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = (
"output/matryoshka_sts_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
        if row["split"] == "dev":
            dev_samples.append(inp_example)
        elif row["split"] == "test":
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CoSENTLoss(model=model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
# Optionally, save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
    model.push_to_hub(f"{model_name}-sts-matryoshka")
except Exception:
    logging.error(
        "Error uploading model to the Hugging Face Hub. To upload it manually, you can run "
        f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({model_save_path!r})` "
        f"and saving it using `model.push_to_hub('{model_name}-sts-matryoshka')`."
    )
# MS MARCO
[MS MARCO Passage Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) is a large dataset for training information retrieval models. It consists of about 500k real search queries from the Bing search engine, together with the relevant text passages that answer each query.
This page shows how to **train** models (Cross-Encoders and sentence embedding models) on this dataset so that they can be used for searching text passages given queries (keywords, phrases, or questions).
If you are interested in how to use these models, see [Application - Retrieve & Re-Rank](../../applications/retrieve_rerank/README.md).
There are **pre-trained models** available, which you can use directly without training your own models. For more information, see: [Pretrained Models](https://www.sbert.net/docs/pretrained_models.html) | [Pretrained Cross-Encoders](https://www.sbert.net/docs/pretrained_cross-encoders.html)
## Bi-Encoder
Cross-Encoders are only suitable for re-ranking a small set of passages. To retrieve suitable documents from a large collection, we have to use a bi-encoder: the documents are independently encoded into fixed-size embeddings, and a query is embedded into the same vector space. Relevant documents can then be found via dot-product similarity.
![BiEncoder](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/BiEncoder.png)
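As a rough usage sketch (the model name and passages below are placeholders; any MS MARCO bi-encoder trained for dot-product scoring would work), retrieval boils down to encoding the corpus once, encoding the query, and scoring with dot-product:

```python
from sentence_transformers import SentenceTransformer, util

# Placeholder model name: one of the pretrained MS MARCO bi-encoders
model = SentenceTransformer("msmarco-distilbert-base-tas-b")

passages = [
    "London has 9,787,426 inhabitants at the 2011 census.",
    "The Eiffel Tower is located in Paris.",
]
passage_embeddings = model.encode(passages, convert_to_tensor=True)  # documents are encoded independently

query_embedding = model.encode("How many people live in London?", convert_to_tensor=True)
hits = util.semantic_search(query_embedding, passage_embeddings, top_k=2, score_function=util.dot_score)[0]
for hit in hits:
    print(round(hit["score"], 2), passages[hit["corpus_id"]])
```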
There are two strategies to **train a bi-encoder** on the MS MARCO dataset:
### MultipleNegativesRankingLoss
**Training code: [train_bi-encoder_mnrl.py](train_bi-encoder_mnrl.py)**
When we use [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss), we provide triplets: ``(query, positive_passage, negative_passage)`` where `positive_passage` is the relevant passage to the query and `negative_passage` is a non-relevant passage to the query.
We compute the embeddings for all queries, positive passages, and negative passages in the corpus and then optimize the following objective: We want to have the `(query, positive_passage)` pair to be close in the vector space, while `(query, negative_passage)` should be distant in vector space.
To further improve the training, we use **in-batch negatives**:
![MultipleNegativesRankingLoss](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/MultipleNegativeRankingLoss.png)
We embed all `queries`, `positive_passages`, and `negative_passages` into the vector space. The matching `(query_i, positive_passage_i)` should be close, while there should be a large distance between a `query` and all other (positive/negative) passages from all other triplets in a batch. For a batch size of 64, we compare a query against 64+64=128 passages, from which only one passage should be close and the 127 others should be distant in vector space.
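A minimal sketch of this setup (toy triplets and a placeholder base model; the complete pipeline is in train_bi-encoder_mnrl.py):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("distilbert-base-uncased")  # placeholder base model (mean pooling is added automatically)

# Each InputExample is a (query, positive_passage, negative_passage) triplet
train_samples = [
    InputExample(texts=["what is python", "Python is a programming language ...", "A python is a large snake ..."]),
    InputExample(texts=["capital of france", "Paris is the capital of France ...", "France borders Spain ..."]),
]
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=64)

# Besides the provided hard negative, every other passage in the batch acts as an in-batch negative
train_loss = losses.MultipleNegativesRankingLoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
```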
One way to **improve training** is to choose really good negatives, also known as **hard negatives**: the negative should look very similar to the positive passage, but it should not be relevant to the query.
We find these hard negatives in the following way: We use existing retrieval systems (e.g. lexical search and other bi-encoder retrieval systems), and for each query we find the most relevant passages. We then use a powerful [Cross-Encoder](../../applications/cross-encoder/README.md) to score the found `(query, passage)` pairs. We provide scores for 160 million such pairs in our [msmarco-hard-negatives dataset](https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives).
For MultipleNegativesRankingLoss, we must ensure that in the triplet `(query, positive_passage, negative_passage)` the `negative_passage` is actually not relevant for the query. The MS MARCO dataset is sadly **highly redundant**: even though there is on average only one passage marked as relevant per query, it actually contains many passages that humans would consider relevant. We must ensure that these passages are **not passed as negatives**. We do this by requiring a certain margin between the CrossEncoder scores of the relevant passages and the mined hard negatives. By default, we set a margin of 3: if the `(query, positive_passage)` pair gets a score of 9 from the CrossEncoder, then we only consider negatives with a CrossEncoder score below 6. This margin ensures that the passages we use as negatives are actually much less relevant than the annotated positive.
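In code, the filtering amounts to the check below (names are illustrative; the same logic appears in train_bi-encoder_mnrl.py further down in this commit):

```python
ce_score_margin = 3.0  # default margin between positives and mined negatives

def filter_hard_negatives(qid, pos_pids, candidate_neg_pids, ce_scores, margin=ce_score_margin):
    """Keep only mined negatives whose CrossEncoder score is at least `margin` below the weakest positive."""
    pos_min_ce_score = min(ce_scores[qid][pid] for pid in pos_pids)
    ce_score_threshold = pos_min_ce_score - margin
    return [pid for pid in candidate_neg_pids if ce_scores[qid][pid] <= ce_score_threshold]
```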
### MarginMSE
**Training code: [train_bi-encoder_margin-mse.py](train_bi-encoder_margin-mse.py)**
[MarginMSELoss](https://www.sbert.net/docs/package_reference/losses.html#marginmseloss) is based on the paper by [Hofstätter et al.](https://arxiv.org/abs/2010.02666). As with MultipleNegativesRankingLoss, we use triplets: `(query, passage1, passage2)`. In contrast to MultipleNegativesRankingLoss, `passage1` and `passage2` do not have to be strictly positive/negative; both can be relevant or not relevant for the given query.
We then compute the [Cross-Encoder](../../applications/cross-encoder/README.md) score for `(query, passage1)` and `(query, passage2)`. We provide scores for 160 million such pairs in our [msmarco-hard-negatives dataset](https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives). We then compute the distance: `CE_distance = CEScore(query, passage1) - CEScore(query, passage2)`
For our bi-encoder training, we encode `query`, `passage1`, and `passage2` into the same vector space and then measure the dot-product between `(query, passage1)` and `(query, passage2)`. Again, we measure the distance: `BE_distance = DotScore(query, passage1) - DotScore(query, passage2)`
We then want to ensure that the distance predicted by the bi-encoder is close to the distance predicted by the cross-encoder, i.e., we optimize the mean-squared error (MSE) between `CE_distance` and `BE_distance`.
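Conceptually, the training setup looks like the following sketch (the CE scores are made-up numbers and the base model is a placeholder; the complete pipeline is in train_bi-encoder_margin-mse.py):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("distilbert-base-uncased")  # placeholder base model

# label = CE_distance = CEScore(query, passage1) - CEScore(query, passage2)
train_samples = [
    InputExample(texts=["what is python", "Python is a programming language ...", "A python is a large snake ..."], label=8.2),
    InputExample(texts=["capital of france", "Paris is the capital of France ...", "France borders Spain ..."], label=7.5),
]
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=64)

# MarginMSELoss minimizes MSE(BE_distance, CE_distance), where BE_distance is the
# dot-product difference DotScore(query, passage1) - DotScore(query, passage2)
train_loss = losses.MarginMSELoss(model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
```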
An **advantage** of MarginMSELoss compared to MultipleNegativesRankingLoss is that we **don't require** a `positive` and `negative` passage. As mentioned before, MS MARCO is redundant, and many passages contain the same or similar content. With MarginMSELoss, we can train on two relevant passages without issues: In that case, the `CE_distance` will be smaller and we expect that our bi-encoder also puts both passages closer in the vector space.
A **disadvantage** of MarginMSELoss is the slower training time: we need far more epochs to get good results. With MultipleNegativesRankingLoss and a batch size of 64, we compare one query against 128 passages; with MarginMSELoss, we compare a query against only two passages.
## Cross-Encoder
A [Cross-Encoder](https://www.sbert.net/examples/applications/cross-encoder/README.html) accepts both inputs, the query and a candidate passage, simultaneously and returns a score between 0 and 1 indicating how relevant the passage is for the given query.
![CrossEncoder](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/CrossEncoder.png)
Cross-Encoders are often used for **re-ranking**: given a list of possibly relevant passages for a query, for example retrieved with BM25 / Elasticsearch, the cross-encoder re-ranks this list so that the most relevant passages are at the top of the result list.
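A small re-ranking sketch (the model name is one of the pretrained MS MARCO cross-encoders; the query and passages are placeholders):

```python
from sentence_transformers import CrossEncoder

model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)

query = "How many people live in London?"
retrieved_passages = [  # e.g. the top hits returned by BM25 / Elasticsearch
    "The Eiffel Tower is located in Paris.",
    "London has 9,787,426 inhabitants at the 2011 census.",
]

# Score every (query, passage) pair and sort the passages by descending relevance
scores = model.predict([(query, passage) for passage in retrieved_passages])
reranked = sorted(zip(retrieved_passages, scores), key=lambda x: x[1], reverse=True)
for passage, score in reranked:
    print(round(float(score), 2), passage)
```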
To **train a cross-encoder** on the MS MARCO dataset, see:
- **[train_cross-encoder_scratch.py](train_cross-encoder_scratch.py)** trains a cross-encoder from scratch using the provided data from the MS MARCO dataset.
## Cross-Encoder Knowledge Distillation
![](https://github.com/UKPLab/sentence-transformers/raw/master/docs/img/msmarco-training-ce-distillation.png)
- **[train_cross-encoder_kd.py](train_cross-encoder_kd.py)** uses a knowledge distillation setup: [Hofstätter et al.](https://arxiv.org/abs/2010.02666) trained an ensemble of 3 (large) models for the MS MARCO dataset and predicted the scores for various (query, passage)-pairs (50% positive, 50% negative). In this example, we use knowledge distillation with a small & fast model and learn the logit scores from the teacher ensemble. This yields performance comparable to the large models, while being 18 times faster. A condensed sketch of the training objective is shown below.
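The distillation objective, in a condensed sketch (teacher scores here are made-up numbers; the full pipeline, including the data download, is in train_cross-encoder_kd.py later in this commit):

```python
import torch
from torch.utils.data import DataLoader
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder

# Identity activation so the student outputs raw logits, matching the teacher's score scale
student = CrossEncoder(
    "microsoft/MiniLM-L12-H384-uncased", num_labels=1, max_length=512, default_activation_function=torch.nn.Identity()
)

# (query, passage) pairs labeled with the teacher ensemble's logit scores (placeholder values)
train_samples = [
    InputExample(texts=["what is python", "Python is a programming language ..."], label=9.1),
    InputExample(texts=["what is python", "A python is a large snake ..."], label=-2.3),
]
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)

# The student regresses directly onto the teacher scores via MSE
student.fit(train_dataloader=train_dataloader, loss_fct=torch.nn.MSELoss(), epochs=1, warmup_steps=10)
```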
"""
This file evaluates CrossEncoder on the TREC 2019 Deep Learning (DL) Track: https://arxiv.org/abs/2003.07820
TREC 2019 DL is based on the MS MARCO corpus. MS MARCO provides sparse annotations, i.e., usually only a single
passage is marked as relevant for a given query. Many other highly relevant passages are not annotated and are hence
counted as errors if a model ranks them high.
TREC DL instead annotated up to 200 passages per query for their relevance to the query. It is therefore better suited to
estimating the model performance for the task of re-ranking in Information Retrieval.
Run:
python eval_cross-encoder-trec-dl.py cross-encoder-model-name
"""
import gzip
from collections import defaultdict
import logging
import tqdm
import numpy as np
import sys
import pytrec_eval
from sentence_transformers import util, CrossEncoder
import os
data_folder = "trec2019-data"
os.makedirs(data_folder, exist_ok=True)
# Read test queries
queries = {}
queries_filepath = os.path.join(data_folder, "msmarco-test2019-queries.tsv.gz")
if not os.path.exists(queries_filepath):
    logging.info("Download " + os.path.basename(queries_filepath))
    util.http_get(
        "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz", queries_filepath
    )
with gzip.open(queries_filepath, "rt", encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        queries[qid] = query
# Read which passages are relevant
relevant_docs = defaultdict(lambda: defaultdict(int))
qrels_filepath = os.path.join(data_folder, "2019qrels-pass.txt")
if not os.path.exists(qrels_filepath):
    logging.info("Download " + os.path.basename(qrels_filepath))
    util.http_get("https://trec.nist.gov/data/deep/2019qrels-pass.txt", qrels_filepath)
with open(qrels_filepath) as fIn:
    for line in fIn:
        qid, _, pid, score = line.strip().split()
        score = int(score)
        if score > 0:
            relevant_docs[qid][pid] = score
# Only use queries that have at least one relevant passage
relevant_qid = []
for qid in queries:
    if len(relevant_docs[qid]) > 0:
        relevant_qid.append(qid)
# Read the top 1000 passages that are supposed to be re-ranked
passage_filepath = os.path.join(data_folder, "msmarco-passagetest2019-top1000.tsv.gz")
if not os.path.exists(passage_filepath):
    logging.info("Download " + os.path.basename(passage_filepath))
    util.http_get(
        "https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz", passage_filepath
    )
passage_cand = {}
with gzip.open(passage_filepath, "rt", encoding="utf8") as fIn:
    for line in fIn:
        qid, pid, query, passage = line.strip().split("\t")
        if qid not in passage_cand:
            passage_cand[qid] = []
        passage_cand[qid].append([pid, passage])
logging.info("Queries: {}".format(len(queries)))
queries_result_list = []
run = {}
model = CrossEncoder(sys.argv[1], max_length=512)
for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]
    cand = passage_cand[qid]
    pids = [c[0] for c in cand]
    corpus_sentences = [c[1] for c in cand]
    cross_inp = [[query, sent] for sent in corpus_sentences]
    if model.config.num_labels > 1:  # Cross-Encoders that predict more than one score: apply softmax and use the positive-class score
        cross_scores = model.predict(cross_inp, apply_softmax=True)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp).tolist()
    cross_scores_sparse = {}
    for idx, pid in enumerate(pids):
        cross_scores_sparse[pid] = cross_scores[idx]
    sparse_scores = cross_scores_sparse
    run[qid] = {}
    for pid in sparse_scores:
        run[qid][pid] = float(sparse_scores[pid])
evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {"ndcg_cut.10"})
scores = evaluator.evaluate(run)
print("Queries:", len(relevant_qid))
print("NDCG@10: {:.2f}".format(np.mean([ele["ndcg_cut_10"] for ele in scores.values()]) * 100))
"""
This script runs the evaluation of an SBERT msmarco model on the
MS MARCO dev dataset and reports different performance metrics for cosine similarity & dot-product.
Usage:
python eval_msmarco.py model_name [max_corpus_size_in_thousands]
"""
from sentence_transformers import LoggingHandler, SentenceTransformer, evaluation, util
import logging
import sys
import os
import tarfile
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Name of the SBERT model
model_name = sys.argv[1]
# You can limit the approx. max size of the corpus. Pass 100 as the second parameter and the corpus will have a size of approx. 100k docs
corpus_max_size = int(sys.argv[2]) * 1000 if len(sys.argv) >= 3 else 0
#### Load model
model = SentenceTransformer(model_name)
### Data files
data_folder = "msmarco-data"
os.makedirs(data_folder, exist_ok=True)
collection_filepath = os.path.join(data_folder, "collection.tsv")
dev_queries_file = os.path.join(data_folder, "queries.dev.small.tsv")
qrels_filepath = os.path.join(data_folder, "qrels.dev.tsv")
### Download files if needed
if not os.path.exists(collection_filepath) or not os.path.exists(dev_queries_file):
    tar_filepath = os.path.join(data_folder, "collectionandqueries.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download: " + tar_filepath)
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
if not os.path.exists(qrels_filepath):
    util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv", qrels_filepath)
### Load data
corpus = {} # Our corpus pid => passage
dev_queries = {} # Our dev queries. qid => query
dev_rel_docs = {} # Mapping qid => set with relevant pids
needed_pids = set() # Passage IDs we need
needed_qids = set() # Query IDs we need
# Load the 6980 dev queries
with open(dev_queries_file, encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        dev_queries[qid] = query.strip()
# Load which passages are relevant for which queries
with open(qrels_filepath) as fIn:
    for line in fIn:
        qid, _, pid, _ = line.strip().split("\t")
        if qid not in dev_queries:
            continue
        if qid not in dev_rel_docs:
            dev_rel_docs[qid] = set()
        dev_rel_docs[qid].add(pid)
        needed_pids.add(pid)
        needed_qids.add(qid)
# Read passages
with open(collection_filepath, encoding="utf8") as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        if pid in needed_pids or corpus_max_size <= 0 or len(corpus) <= corpus_max_size:
            corpus[pid] = passage.strip()
## Run evaluator
logging.info("Queries: {}".format(len(dev_queries)))
logging.info("Corpus: {}".format(len(corpus)))
ir_evaluator = evaluation.InformationRetrievalEvaluator(
dev_queries,
corpus,
dev_rel_docs,
show_progress_bar=True,
corpus_chunk_size=100000,
precision_recall_at_k=[10, 100],
name="msmarco dev",
)
ir_evaluator(model)
# MS MARCO - Multilingual Training
This folder demonstrates how to train a multi-lingual SBERT model for [semantic search](https://www.sbert.net/examples/applications/semantic-search/README.html) / [information retrieval](https://www.sbert.net/examples/applications/retrieve_rerank/README.html).
As dataset, we use the [MS MARCO Passage Ranking dataset](https://github.com/microsoft/MSMARCO-Passage-Ranking). It is a large dataset consisting of search queries from the Bing search engine, together with the relevant text passages that answer each query.
Sadly, this dataset is only available in English. As there are no large multilingual datasets suitable for training a semantic search model, we use **machine translation** to translate the training data.
## Translating Data
We will translate the queries and the passages using [EasyNMT](https://github.com/UKPLab/EasyNMT), which provides state-of-the-art machine translation to 150+ languages.
Then, we will use [Multilingual Knowledge Distillation](https://www.sbert.net/examples/training/multilingual/README.html) and transform the English model trained on MS MARCO to a multi-lingual model.
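A rough sketch of that distillation step (model names are placeholders, and the parallel-query file path assumes the output of the translation script below):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.datasets import ParallelSentencesDataset

teacher = SentenceTransformer("msmarco-distilbert-base-tas-b")  # English MS MARCO model (placeholder)
student = SentenceTransformer("xlm-roberta-base")               # multilingual student (placeholder)

# TSV file with tab-separated parallel sentences: english_query \t translated_query
train_data = ParallelSentencesDataset(student_model=student, teacher_model=teacher)
train_data.load_data("multilingual-data/train_queries.en-de.tsv")

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=64)
# The student learns to mimic the teacher's embeddings for both the English and the translated queries
train_loss = losses.MSELoss(model=student)

student.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=1000)
```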
"""
This script translates the queries in the MS MARCO dataset to the defined target languages.
For machine translation, we use EasyNMT: https://github.com/UKPLab/EasyNMT
You can install it via: pip install easynmt
Usage:
python translate_queries.py [target_language]
"""
import os
from sentence_transformers import LoggingHandler, util
import logging
import tarfile
from easynmt import EasyNMT
import sys
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
target_lang = sys.argv[1]
output_folder = "multilingual-data"
data_folder = "../msmarco-data"
output_filename = os.path.join(output_folder, "train_queries.en-{}.tsv".format(target_lang))
os.makedirs(output_folder, exist_ok=True)
## Does the output file exist? If yes, read it so we can continue the translation
translated_qids = set()
if os.path.exists(output_filename):
    with open(output_filename, "r", encoding="utf8") as fIn:
        for line in fIn:
            splits = line.strip().split("\t")
            translated_qids.add(splits[0])
### Now we read the MS Marco dataset
os.makedirs(data_folder, exist_ok=True)
# Read qrels file for relevant positives per query
train_queries = {}
qrels_train = os.path.join(data_folder, "qrels.train.tsv")
if not os.path.exists(qrels_train):
    util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv", qrels_train)
with open(qrels_train) as fIn:
    for line in fIn:
        qid, _, pid, _ = line.strip().split()
        if qid not in translated_qids:
            train_queries[qid] = None
# Read all queries
queries_filepath = os.path.join(data_folder, "queries.train.tsv")
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, "queries.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download queries.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
with open(queries_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        if qid in train_queries:
            train_queries[qid] = query.strip()
qids = [qid for qid in train_queries if train_queries[qid] is not None]
queries = [train_queries[qid] for qid in qids]
# Define our translation model
translation_model = EasyNMT("opus-mt")
print("Start translation of {} queries.".format(len(queries)))
print("This can take a while. But you can stop this script at any point")
with open(output_filename, "a" if os.path.exists(output_filename) else "w", encoding="utf8") as fOut:
    for qid, query, translated_query in zip(
        qids,
        queries,
        translation_model.translate_stream(
            queries,
            source_lang="en",
            target_lang=target_lang,
            beam_size=2,
            perform_sentence_splitting=False,
            chunk_size=256,
            batch_size=64,
        ),
    ):
        fOut.write("{}\t{}\n".format(qid, translated_query.replace("\t", " ")))
        fOut.flush()
import sys
import json
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, losses, InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
import tqdm
from torch.utils.data import Dataset
import random
from shutil import copyfile
import pickle
import argparse
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
parser = argparse.ArgumentParser()
parser.add_argument("--train_batch_size", default=64, type=int)
parser.add_argument("--max_seq_length", default=300, type=int)
parser.add_argument("--model_name", required=True)
parser.add_argument("--max_passages", default=0, type=int)
parser.add_argument("--epochs", default=30, type=int)
parser.add_argument("--pooling", default="mean")
parser.add_argument(
"--negs_to_use",
default=None,
help="From which systems should negatives be used? Multiple systems separated by comma. None = all",
)
parser.add_argument("--warmup_steps", default=1000, type=int)
parser.add_argument("--lr", default=2e-5, type=float)
parser.add_argument("--num_negs_per_system", default=5, type=int)
parser.add_argument("--use_pre_trained_model", default=False, action="store_true")
parser.add_argument("--use_all_queries", default=False, action="store_true")
args = parser.parse_args()
logging.info(str(args))
# The model we want to fine-tune
train_batch_size = (
args.train_batch_size
) # Increasing the train batch size improves the model performance, but requires more GPU memory
model_name = args.model_name
max_passages = args.max_passages
max_seq_length = args.max_seq_length # Max length for passages. Increasing it, requires more GPU memory
num_negs_per_system = (
args.num_negs_per_system
) # We used different systems to mine hard negatives. Number of hard negatives to add from each system
num_epochs = args.epochs # Number of epochs we want to train
# Load our embedding model
if args.use_pre_trained_model:
    logging.info("use pretrained SBERT model")
    model = SentenceTransformer(model_name)
    model.max_seq_length = max_seq_length
else:
    logging.info("Create new SBERT model")
    word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), args.pooling)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
model_save_path = f'output/train_bi-encoder-margin_mse-{model_name.replace("/", "-")}-batch_size_{train_batch_size}-{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}'
# Write self to path
os.makedirs(model_save_path, exist_ok=True)
train_script_path = os.path.join(model_save_path, "train_script.py")
copyfile(__file__, train_script_path)
with open(train_script_path, "a") as fOut:
    fOut.write("\n\n# Script was called via:\n#python " + " ".join(sys.argv))
### Now we read the MS Marco dataset
data_folder = "msmarco-data"
#### Read the corpus files, that contain all the passages. Store them in the corpus dict
corpus = {} # dict in the format: passage_id -> passage. Stores all existent passages
collection_filepath = os.path.join(data_folder, "collection.tsv")
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, "collection.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
logging.info("Read corpus: collection.tsv")
with open(collection_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        pid = int(pid)
        corpus[pid] = passage
### Read the train queries, store in queries dict
queries = {} # dict in the format: query_id -> query. Stores all training queries
queries_filepath = os.path.join(data_folder, "queries.train.tsv")
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, "queries.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download queries.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
with open(queries_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        qid = int(qid)
        queries[qid] = query
# Load a dict (qid, pid) -> ce_score that maps query-ids (qid) and paragraph-ids (pid)
# to the CrossEncoder score computed by the cross-encoder/ms-marco-MiniLM-L-6-v2 model
ce_scores_file = os.path.join(data_folder, "cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz")
if not os.path.exists(ce_scores_file):
    logging.info("Download cross-encoder scores file")
    util.http_get(
        "https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives/resolve/main/cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz",
        ce_scores_file,
    )
logging.info("Load CrossEncoder scores dict")
with gzip.open(ce_scores_file, "rb") as fIn:
    ce_scores = pickle.load(fIn)
# As training data we use hard-negatives that have been mined using various systems
hard_negatives_filepath = os.path.join(data_folder, "msmarco-hard-negatives.jsonl.gz")
if not os.path.exists(hard_negatives_filepath):
    logging.info("Download hard negatives file")
    util.http_get(
        "https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives/resolve/main/msmarco-hard-negatives.jsonl.gz",
        hard_negatives_filepath,
    )
logging.info("Read hard negatives train file")
train_queries = {}
negs_to_use = None
with gzip.open(hard_negatives_filepath, "rt") as fIn:
    for line in tqdm.tqdm(fIn):
        if max_passages > 0 and len(train_queries) >= max_passages:
            break
        data = json.loads(line)

        # Get the positive passage ids
        pos_pids = data["pos"]

        # Get the hard negatives
        neg_pids = set()
        if negs_to_use is None:
            if args.negs_to_use is not None:  # Use specific system for negatives
                negs_to_use = args.negs_to_use.split(",")
            else:  # Use all systems
                negs_to_use = list(data["neg"].keys())
            logging.info("Using negatives from the following systems: {}".format(", ".join(negs_to_use)))

        for system_name in negs_to_use:
            if system_name not in data["neg"]:
                continue

            system_negs = data["neg"][system_name]
            negs_added = 0
            for pid in system_negs:
                if pid not in neg_pids:
                    neg_pids.add(pid)
                    negs_added += 1
                    if negs_added >= num_negs_per_system:
                        break

        if args.use_all_queries or (len(pos_pids) > 0 and len(neg_pids) > 0):
            train_queries[data["qid"]] = {
                "qid": data["qid"],
                "query": queries[data["qid"]],
                "pos": pos_pids,
                "neg": neg_pids,
            }
logging.info("Train queries: {}".format(len(train_queries)))
# We create a custom MSMARCO dataset that returns triplets (query, positive, negative)
# on-the-fly based on the information from the mined-hard-negatives jsonl file.
class MSMARCODataset(Dataset):
    def __init__(self, queries, corpus, ce_scores):
        self.queries = queries
        self.queries_ids = list(queries.keys())
        self.corpus = corpus
        self.ce_scores = ce_scores

        for qid in self.queries:
            self.queries[qid]["pos"] = list(self.queries[qid]["pos"])
            self.queries[qid]["neg"] = list(self.queries[qid]["neg"])
            random.shuffle(self.queries[qid]["neg"])

    def __getitem__(self, item):
        query = self.queries[self.queries_ids[item]]
        query_text = query["query"]
        qid = query["qid"]

        if len(query["pos"]) > 0:
            pos_id = query["pos"].pop(0)  # Pop positive and add at end
            pos_text = self.corpus[pos_id]
            query["pos"].append(pos_id)
        else:  # We only have negatives, use two negs
            pos_id = query["neg"].pop(0)  # Pop negative and add at end
            pos_text = self.corpus[pos_id]
            query["neg"].append(pos_id)

        # Get a negative passage
        neg_id = query["neg"].pop(0)  # Pop negative and add at end
        neg_text = self.corpus[neg_id]
        query["neg"].append(neg_id)

        pos_score = self.ce_scores[qid][pos_id]
        neg_score = self.ce_scores[qid][neg_id]

        return InputExample(texts=[query_text, pos_text, neg_text], label=pos_score - neg_score)

    def __len__(self):
        return len(self.queries)
# For training the SentenceTransformer model, we need a dataset, a dataloader, and a loss used for training.
train_dataset = MSMARCODataset(queries=train_queries, corpus=corpus, ce_scores=ce_scores)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size, drop_last=True)
train_loss = losses.MarginMSELoss(model=model)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=args.warmup_steps,
use_amp=True,
checkpoint_path=model_save_path,
checkpoint_save_steps=10000,
optimizer_params={"lr": args.lr},
)
# Save latest model
model.save(model_save_path)
"""
This example shows how to train a Bi-Encoder for the MS MARCO dataset (https://github.com/microsoft/MSMARCO-Passage-Ranking).
The queries and passages are passed independently to the transformer network to produce fixed-sized embeddings.
These embeddings can then be compared using cosine-similarity to find matching passages for a given query.
For training, we use MultipleNegativesRankingLoss. There, we pass triplets in the format:
(query, positive_passage, negative_passage)
Negative passages are hard negative examples that were mined using different dense embedding and lexical search methods.
Each positive and negative passage comes with a score from a Cross-Encoder. This allows denoising, i.e. removing false negative
passages that are actually relevant for the query.
With a distilbert-base-uncased model, it should achieve a performance of about 33.79 MRR@10 on the MS MARCO Passages Dev-Corpus.
Running this script:
python train_bi-encoder_mnrl.py
"""
import json
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, losses, InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
import tqdm
from torch.utils.data import Dataset
import random
import pickle
import argparse
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
parser = argparse.ArgumentParser()
parser.add_argument("--train_batch_size", default=64, type=int)
parser.add_argument("--max_seq_length", default=300, type=int)
parser.add_argument("--model_name", required=True)
parser.add_argument("--max_passages", default=0, type=int)
parser.add_argument("--epochs", default=10, type=int)
parser.add_argument("--pooling", default="mean")
parser.add_argument(
"--negs_to_use",
default=None,
help="From which systems should negatives be used? Multiple systems separated by comma. None = all",
)
parser.add_argument("--warmup_steps", default=1000, type=int)
parser.add_argument("--lr", default=2e-5, type=float)
parser.add_argument("--num_negs_per_system", default=5, type=int)
parser.add_argument("--use_pre_trained_model", default=False, action="store_true")
parser.add_argument("--use_all_queries", default=False, action="store_true")
parser.add_argument("--ce_score_margin", default=3.0, type=float)
args = parser.parse_args()
print(args)
# The model we want to fine-tune
model_name = args.model_name
train_batch_size = (
args.train_batch_size
) # Increasing the train batch size improves the model performance, but requires more GPU memory
max_seq_length = args.max_seq_length # Max length for passages. Increasing it, requires more GPU memory
ce_score_margin = args.ce_score_margin # Margin for the CrossEncoder score between negative and positive passages
num_negs_per_system = (
args.num_negs_per_system
) # We used different systems to mine hard negatives. Number of hard negatives to add from each system
num_epochs = args.epochs # Number of epochs we want to train
# Load our embedding model
if args.use_pre_trained_model:
    logging.info("use pretrained SBERT model")
    model = SentenceTransformer(model_name)
    model.max_seq_length = max_seq_length
else:
    logging.info("Create new SBERT model")
    word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), args.pooling)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
model_save_path = "output/train_bi-encoder-mnrl-{}-margin_{:.1f}-{}".format(
model_name.replace("/", "-"), ce_score_margin, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
### Now we read the MS Marco dataset
data_folder = "msmarco-data"
#### Read the corpus files, that contain all the passages. Store them in the corpus dict
corpus = {} # dict in the format: passage_id -> passage. Stores all existent passages
collection_filepath = os.path.join(data_folder, "collection.tsv")
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, "collection.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
logging.info("Read corpus: collection.tsv")
with open(collection_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        pid = int(pid)
        corpus[pid] = passage
### Read the train queries, store in queries dict
queries = {} # dict in the format: query_id -> query. Stores all training queries
queries_filepath = os.path.join(data_folder, "queries.train.tsv")
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, "queries.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download queries.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
with open(queries_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        qid = int(qid)
        queries[qid] = query
# Load a dict (qid, pid) -> ce_score that maps query-ids (qid) and paragraph-ids (pid)
# to the CrossEncoder score computed by the cross-encoder/ms-marco-MiniLM-L-6-v2 model
ce_scores_file = os.path.join(data_folder, "cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz")
if not os.path.exists(ce_scores_file):
    logging.info("Download cross-encoder scores file")
    util.http_get(
        "https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives/resolve/main/cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz",
        ce_scores_file,
    )
logging.info("Load CrossEncoder scores dict")
with gzip.open(ce_scores_file, "rb") as fIn:
    ce_scores = pickle.load(fIn)
# As training data we use hard-negatives that have been mined using various systems
hard_negatives_filepath = os.path.join(data_folder, "msmarco-hard-negatives.jsonl.gz")
if not os.path.exists(hard_negatives_filepath):
    logging.info("Download hard negatives file")
    util.http_get(
        "https://huggingface.co/datasets/sentence-transformers/msmarco-hard-negatives/resolve/main/msmarco-hard-negatives.jsonl.gz",
        hard_negatives_filepath,
    )
logging.info("Read hard negatives train file")
train_queries = {}
negs_to_use = None
with gzip.open(hard_negatives_filepath, "rt") as fIn:
    for line in tqdm.tqdm(fIn):
        data = json.loads(line)

        # Get the positive passage ids
        qid = data["qid"]
        pos_pids = data["pos"]
        if len(pos_pids) == 0:  # Skip entries without positive passages
            continue

        pos_min_ce_score = min([ce_scores[qid][pid] for pid in data["pos"]])
        ce_score_threshold = pos_min_ce_score - ce_score_margin

        # Get the hard negatives
        neg_pids = set()
        if negs_to_use is None:
            if args.negs_to_use is not None:  # Use specific system for negatives
                negs_to_use = args.negs_to_use.split(",")
            else:  # Use all systems
                negs_to_use = list(data["neg"].keys())
            logging.info("Using negatives from the following systems: {}".format(", ".join(negs_to_use)))

        for system_name in negs_to_use:
            if system_name not in data["neg"]:
                continue

            system_negs = data["neg"][system_name]
            negs_added = 0
            for pid in system_negs:
                if ce_scores[qid][pid] > ce_score_threshold:
                    continue

                if pid not in neg_pids:
                    neg_pids.add(pid)
                    negs_added += 1
                    if negs_added >= num_negs_per_system:
                        break

        if args.use_all_queries or (len(pos_pids) > 0 and len(neg_pids) > 0):
            train_queries[data["qid"]] = {
                "qid": data["qid"],
                "query": queries[data["qid"]],
                "pos": pos_pids,
                "neg": neg_pids,
            }
del ce_scores
logging.info("Train queries: {}".format(len(train_queries)))
# We create a custom MSMARCO dataset that returns triplets (query, positive, negative)
# on-the-fly based on the information from the mined-hard-negatives jsonl file.
class MSMARCODataset(Dataset):
    def __init__(self, queries, corpus):
        self.queries = queries
        self.queries_ids = list(queries.keys())
        self.corpus = corpus

        for qid in self.queries:
            self.queries[qid]["pos"] = list(self.queries[qid]["pos"])
            self.queries[qid]["neg"] = list(self.queries[qid]["neg"])
            random.shuffle(self.queries[qid]["neg"])

    def __getitem__(self, item):
        query = self.queries[self.queries_ids[item]]
        query_text = query["query"]

        pos_id = query["pos"].pop(0)  # Pop positive and add at end
        pos_text = self.corpus[pos_id]
        query["pos"].append(pos_id)

        neg_id = query["neg"].pop(0)  # Pop negative and add at end
        neg_text = self.corpus[neg_id]
        query["neg"].append(neg_id)

        return InputExample(texts=[query_text, pos_text, neg_text])

    def __len__(self):
        return len(self.queries)
# For training the SentenceTransformer model, we need a dataset, a dataloader, and a loss used for training.
train_dataset = MSMARCODataset(train_queries, corpus=corpus)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=args.warmup_steps,
use_amp=True,
checkpoint_path=model_save_path,
checkpoint_save_steps=len(train_dataloader),
optimizer_params={"lr": args.lr},
)
# Save the model
model.save(model_save_path)
"""
This example shows how to train a Cross-Encoder for the MS MARCO dataset (https://github.com/microsoft/MSMARCO-Passage-Ranking).
In this example, we use a knowledge distillation setup: Sebastian Hofstätter et al. (https://arxiv.org/abs/2010.02666) trained
an ensemble of large Transformer models on the MS MARCO dataset and combined the scores from a BERT-base, BERT-large, and ALBERT-large model.
We use the logit scores from the ensemble to train a smaller model. We found that the MiniLM model gives the best performance while
offering the highest speed.
The resulting Cross-Encoder can then be used for passage re-ranking: You retrieve for example 100 passages
for a given query, for example with Elasticsearch, and pass the query+retrieved_passage to the CrossEncoder
for scoring. You then sort the results according to the output of the CrossEncoder.
This gives a significant boost compared to out-of-the-box Elasticsearch / BM25 ranking.
Running this script:
python train_cross-encoder_kd.py
"""
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
import torch
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# First, we define the transformer model we want to fine-tune
model_name = "microsoft/MiniLM-L12-H384-uncased"
train_batch_size = 32
num_epochs = 1
model_save_path = (
"output/training_ms-marco_cross-encoder-v2-"
+ model_name.replace("/", "-")
+ "-"
+ datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# We set num_labels=1 and set the activation function to Identity, so that we get the raw logits
model = CrossEncoder(model_name, num_labels=1, max_length=512, default_activation_function=torch.nn.Identity())
### Now we read the MS Marco dataset
data_folder = "msmarco-data"
os.makedirs(data_folder, exist_ok=True)
#### Read the corpus files, that contain all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, "collection.tsv")
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, "collection.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
with open(collection_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        corpus[pid] = passage
### Read the train queries, store in queries dict
queries = {}
queries_filepath = os.path.join(data_folder, "queries.train.tsv")
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, "queries.tar.gz")
    if not os.path.exists(tar_filepath):
        logging.info("Download queries.tar.gz")
        util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)
with open(queries_filepath, "r", encoding="utf8") as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        queries[qid] = query
### Now we create our dev data
train_samples = []
dev_samples = {}
# We use 200 random queries from the train set for evaluation during training
# Each query has at least one relevant and up to 200 irrelevant (negative) passages
num_dev_queries = 200
num_max_dev_negatives = 200
# msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz and msmarco-qidpidtriples.rnd-shuf.train.tsv.gz are randomly
# shuffled versions of qidpidtriples.train.full.2.tsv.gz from the MS Marco website
# The train-eval split contains 500 random queries that can be used for evaluation during training
train_eval_filepath = os.path.join(data_folder, "msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz")
if not os.path.exists(train_eval_filepath):
logging.info("Download " + os.path.basename(train_eval_filepath))
util.http_get("https://sbert.net/datasets/msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz", train_eval_filepath)
with gzip.open(train_eval_filepath, "rt") as fIn:
for line in fIn:
qid, pos_id, neg_id = line.strip().split()
if qid not in dev_samples and len(dev_samples) < num_dev_queries:
dev_samples[qid] = {"query": queries[qid], "positive": set(), "negative": set()}
if qid in dev_samples:
dev_samples[qid]["positive"].add(corpus[pos_id])
if len(dev_samples[qid]["negative"]) < num_max_dev_negatives:
dev_samples[qid]["negative"].add(corpus[neg_id])
dev_qids = set(dev_samples.keys())
# Read our training file
# As input examples, we provide the (query, passage) pair together with the logit score from the teacher ensemble
teacher_logits_filepath = os.path.join(data_folder, "bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv")
train_samples = []
if not os.path.exists(teacher_logits_filepath):
util.http_get(
"https://zenodo.org/record/4068216/files/bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv?download=1",
teacher_logits_filepath,
)
with open(teacher_logits_filepath) as fIn:
for line in fIn:
pos_score, neg_score, qid, pid1, pid2 = line.strip().split("\t")
if qid in dev_qids: # Skip queries in our dev dataset
continue
train_samples.append(InputExample(texts=[queries[qid], corpus[pid1]], label=float(pos_score)))
train_samples.append(InputExample(texts=[queries[qid], corpus[pid2]], label=float(neg_score)))
# We create a DataLoader to load our train samples
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)
# We add an evaluator, which evaluates the performance during training
# It re-ranks the positive and negative passages for each dev query and computes Mean Reciprocal Rank (MRR)
evaluator = CERerankingEvaluator(dev_samples, name="train-eval")
# Configure the training
warmup_steps = 5000
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_dataloader=train_dataloader,
loss_fct=torch.nn.MSELoss(),
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=5000,
warmup_steps=warmup_steps,
output_path=model_save_path,
optimizer_params={"lr": 7e-6},
use_amp=True,
)
# Save latest model
model.save(model_save_path + "-latest")
"""
This example shows how to train a Cross-Encoder for the MS MARCO dataset (https://github.com/microsoft/MSMARCO-Passage-Ranking).
The query and the passage are passed simultaneously to a Transformer network. The network then returns
a score between 0 and 1 indicating how relevant the passage is for the given query.
The resulting Cross-Encoder can then be used for passage re-ranking: you retrieve, for example, 100 passages
for a given query (e.g. with Elasticsearch) and pass each (query, retrieved_passage) pair to the CrossEncoder
for scoring. The results are then sorted according to the CrossEncoder output.
This gives a significant boost compared to out-of-the-box Elasticsearch / BM25 ranking.
Running this script:
python train_cross-encoder.py
"""
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
import tqdm
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# First, we define the transformer model we want to fine-tune
model_name = "distilroberta-base"
train_batch_size = 32
num_epochs = 1
model_save_path = (
"output/training_ms-marco_cross-encoder-"
+ model_name.replace("/", "-")
+ "-"
+ datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# We train the network on a binary label task:
# Given a [query, passage] pair, is the label 0 = irrelevant or 1 = relevant?
# We use a positive-to-negative ratio: for 1 positive sample (label 1) we include 4 negative samples (label 0)
# in our training setup. For the negative samples, we use the triplets provided by MS Marco that
# specify (query, positive sample, negative sample).
pos_neg_ratio = 4
# Maximal number of training samples we want to use
max_train_samples = 2e7
# We set num_labels=1, which predicts a continuous score between 0 and 1
model = CrossEncoder(model_name, num_labels=1, max_length=512)
### Now we read the MS Marco dataset
data_folder = "msmarco-data"
os.makedirs(data_folder, exist_ok=True)
#### Read the corpus file, which contains all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, "collection.tsv")
if not os.path.exists(collection_filepath):
tar_filepath = os.path.join(data_folder, "collection.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download collection.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
with open(collection_filepath, "r", encoding="utf8") as fIn:
for line in fIn:
pid, passage = line.strip().split("\t")
corpus[pid] = passage
### Read the train queries, store in queries dict
queries = {}
queries_filepath = os.path.join(data_folder, "queries.train.tsv")
if not os.path.exists(queries_filepath):
tar_filepath = os.path.join(data_folder, "queries.tar.gz")
if not os.path.exists(tar_filepath):
logging.info("Download queries.tar.gz")
util.http_get("https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz", tar_filepath)
with tarfile.open(tar_filepath, "r:gz") as tar:
tar.extractall(path=data_folder)
with open(queries_filepath, "r", encoding="utf8") as fIn:
for line in fIn:
qid, query = line.strip().split("\t")
queries[qid] = query
### Now we create our training & dev data
train_samples = []
dev_samples = {}
# We use 200 random queries from the train set for evaluation during training
# Each query has at least one relevant and up to 200 irrelevant (negative) passages
num_dev_queries = 200
num_max_dev_negatives = 200
# msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz and msmarco-qidpidtriples.rnd-shuf.train.tsv.gz are randomly
# shuffled versions of qidpidtriples.train.full.2.tsv.gz from the MS Marco website
# The train-eval split contains 500 random queries that can be used for evaluation during training
train_eval_filepath = os.path.join(data_folder, "msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz")
if not os.path.exists(train_eval_filepath):
logging.info("Download " + os.path.basename(train_eval_filepath))
util.http_get("https://sbert.net/datasets/msmarco-qidpidtriples.rnd-shuf.train-eval.tsv.gz", train_eval_filepath)
with gzip.open(train_eval_filepath, "rt") as fIn:
for line in fIn:
qid, pos_id, neg_id = line.strip().split()
if qid not in dev_samples and len(dev_samples) < num_dev_queries:
dev_samples[qid] = {"query": queries[qid], "positive": set(), "negative": set()}
if qid in dev_samples:
dev_samples[qid]["positive"].add(corpus[pos_id])
if len(dev_samples[qid]["negative"]) < num_max_dev_negatives:
dev_samples[qid]["negative"].add(corpus[neg_id])
# Read our training file
train_filepath = os.path.join(data_folder, "msmarco-qidpidtriples.rnd-shuf.train.tsv.gz")
if not os.path.exists(train_filepath):
logging.info("Download " + os.path.basename(train_filepath))
util.http_get("https://sbert.net/datasets/msmarco-qidpidtriples.rnd-shuf.train.tsv.gz", train_filepath)
cnt = 0
with gzip.open(train_filepath, "rt") as fIn:
for line in tqdm.tqdm(fIn, unit_scale=True):
qid, pos_id, neg_id = line.strip().split()
if qid in dev_samples:
continue
query = queries[qid]
if (cnt % (pos_neg_ratio + 1)) == 0:
passage = corpus[pos_id]
label = 1
else:
passage = corpus[neg_id]
label = 0
train_samples.append(InputExample(texts=[query, passage], label=label))
cnt += 1
if cnt >= max_train_samples:
break
# We create a DataLoader to load our train samples
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
# We add an evaluator, which evaluates the performance during training
# It re-ranks the positive and negative passages for each dev query and computes Mean Reciprocal Rank (MRR)
evaluator = CERerankingEvaluator(dev_samples, name="train-eval")
# Configure the training
warmup_steps = 5000
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_dataloader=train_dataloader,
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=10000,
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=True,
)
# Save latest model
model.save(model_save_path + "-latest")
# Multilingual-Models
The issue with multilingual BERT (mBERT) as well as with XLM-RoBERTa is that they produce rather poor sentence representations out-of-the-box. Further, the vector spaces of different languages are not aligned, i.e., sentences with the same content in different languages would be mapped to different locations in the vector space.
In my publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813) I describe an easy approach to extend sentence embeddings to further languages.
Chien Vu also wrote a nice blog article on this technique: [A complete guide to transfer learning from English to other Languages using Sentence Embeddings BERT Models](https://towardsdatascience.com/a-complete-guide-to-transfer-learning-from-english-to-other-languages-using-sentence-embeddings-8c427f8804a9)
## Available Pre-trained Models
For a list of available models, see [Pretrained Models](https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models).
## Usage
You can use the models in the following way:
```python
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("model-name")
embeddings = embedder.encode(["Hello World", "Hallo Welt", "Hola mundo"])
print(embeddings)
```
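Because the vector spaces are aligned across languages, translations of the same sentence receive a high similarity score. A small sketch of this, assuming the `distiluse-base-multilingual-cased` model from the table below:

```python
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer("distiluse-base-multilingual-cased")
embeddings = embedder.encode(["Hello World", "Hallo Welt", "A cat sits on the mat"], convert_to_tensor=True)

# Translation pairs end up close in the shared vector space, unrelated sentences do not
print(util.cos_sim(embeddings[0], embeddings[1]).item())  # en-de translation pair -> high similarity
print(util.cos_sim(embeddings[0], embeddings[2]).item())  # unrelated sentences -> lower similarity
```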
## Performance
The performance was evaluated on the [Semantic Textual Similarity (STS) 2017 dataset](http://ixa2.si.ehu.es/stswiki/index.php/Main_Page). The task is to predict the semantic similarity (on a scale 0-5) of two given sentences. STS2017 has monolingual test data for English, Arabic, and Spanish, and cross-lingual test data for English-Arabic, -Spanish and -Turkish.
We extended the STS2017 and added cross-lingual test data for English-German, French-English, Italian-English, and Dutch-English ([STS2017-extended.zip](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/STS2017-extended.zip)). The performance is measured using Spearman correlation between the predicted similarity score and the gold score.
<table class="docutils">
<tr>
<th>Model</th>
<th>AR-AR</th>
<th>AR-EN</th>
<th>ES-ES</th>
<th>ES-EN</th>
<th>EN-EN</th>
<th>TR-EN</th>
<th>EN-DE</th>
<th>FR-EN</th>
<th>IT-EN</th>
<th>NL-EN</th>
<th>Average</th>
</tr>
<tr>
<td>XLM-RoBERTa mean pooling </td>
<td align="center">25.7</td>
<td align="center">17.4</td>
<td align="center">51.8</td>
<td align="center">10.9</td>
<td align="center">50.7</td>
<td align="center">9.2</td>
<td align="center">21.3</td>
<td align="center">16.6</td>
<td align="center">22.9</td>
<td align="center">26.0</td>
<td align="center">25.2</td>
</tr>
<tr>
<td>mBERT mean pooling </td>
<td align="center">50.9</td>
<td align="center">16.7</td>
<td align="center">56.7</td>
<td align="center">21.5</td>
<td align="center">54.4</td>
<td align="center">16.0</td>
<td align="center">33.9</td>
<td align="center">33.0</td>
<td align="center">34.0</td>
<td align="center">35.6</td>
<td align="center">35.3</td>
</tr>
<tr>
<td>LASER</td>
<td align="center">68.9</td>
<td align="center">66.5</td>
<td align="center">79.7</td>
<td align="center">57.9</td>
<td align="center">77.6</td>
<td align="center">72.0</td>
<td align="center">64.2</td>
<td align="center">69.1</td>
<td align="center">70.8</td>
<td align="center">68.5</td>
<td align="center">69.5</td>
</tr>
<tr>
<td colspan="12"><b>Sentence Transformer Models</b></td>
</tr>
<tr>
<td>distiluse-base-multilingual-cased</td>
<td align="center">75.9</td>
<td align="center">77.6</td>
<td align="center">85.3</td>
<td align="center">78.7</td>
<td align="center">85.4</td>
<td align="center">75.5</td>
<td align="center">80.3</td>
<td align="center">80.2</td>
<td align="center">80.5</td>
<td align="center">81.7</td>
<td align="center">80.1</td>
</tr>
</table>
## Extend your own models
![Multilingual Knowledge Distillation](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/multilingual-distillation.png)
The idea is based on a fixed (monolingual) **teacher model** that produces sentence embeddings with our desired properties in one language. The **student model** is supposed to mimic the teacher model, i.e., the same English sentence should be mapped to the same vector by the teacher and by the student model. So that the student model also works for further languages, we train the student model on parallel (translated) sentences: the translation of each sentence should be mapped to the same vector as the original sentence.
In the above figure, the student model should map *Hello World* and the German translation *Hallo Welt* to the vector of *teacher_model('Hello World')*. We achieve this by training the student model using mean squared error (MSE) loss.
In our experiments we initialized the student model with the multilingual XLM-RoBERTa model.
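As a minimal conceptual sketch of this objective (not the actual training loop, which the scripts below implement; the model names are the ones used later in make_multilingual.py), the two MSE terms for a single English/German pair look like this:

```python
import torch
from sentence_transformers import SentenceTransformer

teacher = SentenceTransformer("paraphrase-distilroberta-base-v2")  # fixed monolingual teacher
student = SentenceTransformer("xlm-roberta-base")  # multilingual student (mean pooling added automatically)

en, de = "Hello World", "Hallo Welt"
target = torch.tensor(teacher.encode(en))  # teacher embedding of the English source sentence
student_en = torch.tensor(student.encode(en))
student_de = torch.tensor(student.encode(de))

# Training minimizes both terms: student(source) -> teacher(source) and student(translation) -> teacher(source)
loss = torch.nn.functional.mse_loss(student_en, target) + torch.nn.functional.mse_loss(student_de, target)
print(loss.item())
```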
## Training
For a **fully automatic code example**, see [make_multilingual.py](make_multilingual.py).
This script downloads the parallel sentences corpus, a corpus with transcripts and translations from talks. It then extends a monolingual model to several languages (en, de, es, it, fr, ar, tr). The corpus contains parallel data for more than 100 languages, hence you can simply change the script and train a multilingual model for your favorite languages.
## Data Format
As training data we require parallel sentences, i.e., sentences translated into various languages. As data format, we use a tab-separated .tsv file. In the first column, you have your source sentence, for example, an English sentence. In the following columns, you have the translations of this source sentence. If you have multiple translations per source sentence, you can put them in the same line or in different lines.
```
Source_sentence Target_lang1 Target_lang2 Target_lang3
Source_sentence Target_lang1 Target_lang2
```
An example file could look like this (EN DE ES):
```
Hello World Hallo Welt Hola Mundo
Sentences are separated with a tab character. Die Sätze sind per Tab getrennt. Las oraciones se separan con un carácter de tabulación.
```
The order of the translations is not important; it is only important that the first column contains a sentence in a language that is understood by the teacher model.
## Loading Training Datasets
You can load such a training file using the *ParallelSentencesDataset* class:
```python
from torch.utils.data import DataLoader
from sentence_transformers import losses
from sentence_transformers.datasets import ParallelSentencesDataset
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model)
train_data.load_data("path/to/tab/separated/train-en-de.tsv")
train_data.load_data("path/to/tab/separated/train-en-es.tsv.gz")
train_data.load_data("path/to/tab/separated/train-en-fr.tsv.gz")
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)
```
You load a file with the *load_data()* method. You can load multiple files by calling load_data multiple times. You can also load regular files or .gz-compressed files.
By default, all datasets are weighted equally. In the above example, a (source, translation) pair is sampled with equal probability from each of the three datasets. If you pass a `weight` parameter (integer), you can weight some datasets higher or lower, for example as sketched below.
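Continuing the snippet above, a short sketch of such a weighting (the integers are relative sampling weights; the exact values are only an example):

```python
# Sample (source, translation) pairs from the en-de file twice as often as from the other files
train_data.load_data("path/to/tab/separated/train-en-de.tsv", weight=200)
train_data.load_data("path/to/tab/separated/train-en-es.tsv.gz", weight=100)
train_data.load_data("path/to/tab/separated/train-en-fr.tsv.gz", weight=100)
```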
## Sources for Training Data
A great website for a vast number of parallel (translated) datasets is [OPUS](http://opus.nlpl.eu/). There, you find parallel datasets for more than 400 languages.
The [examples/training/multilingual](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/multilingual/) folder contains some scripts that download parallel training data and bring it into the right format:
- [get_parallel_data_opus.py](get_parallel_data_opus.py): This script downloads data from the [OPUS](http://opus.nlpl.eu/) website.
- [get_parallel_data_tatoeba.py](get_parallel_data_tatoeba.py): This script downloads data from the [Tatoeba](https://tatoeba.org/) website, a website for language learners with example sentences in many languages.
- [get_parallel_data_talks.py](get_parallel_data_talks.py): This script downloads the parallel sentences corpus, which contains transcripts and translations of more than 4,000 talks in 100+ languages.
## Evaluation
Training can be evaluated in different ways. For an example of how to use these evaluation methods, see [make_multilingual.py](make_multilingual.py).
### MSE Evaluation
You can measure the mean squared error (MSE) between the student embeddings and the teacher embeddings. This can be achieved with the `MSEEvaluator`:
```python
# src_sentences and trg_sentences are lists of translated sentences, such that trg_sentences[i] is the translation of src_sentences[i]
dev_mse = evaluation.MSEEvaluator(src_sentences, trg_sentences, teacher_model=teacher_model)
```
This evaluator computes the teacher embeddings for the `src_sentences`, for example, for English. During training, the student model is used to compute embeddings for the `trg_sentences`, for example, for Spanish. The distance between teacher and student embeddings is measured. Lower scores indicate better performance.
### Translation Accuracy
You can also measure the translation accuracy. Given a list of source sentences (for example, 1000 English sentences) and a list of matching target (translated) sentences (for example, 1000 Spanish sentences), we check for each sentence pair whether their embeddings are closest in terms of cosine similarity: i.e., for each `src_sentences[i]` we check whether `trg_sentences[i]` has the highest similarity out of all target sentences. If this is the case, we have a hit, otherwise an error. This evaluator reports accuracy (higher = better).
```python
# src_sentences and trg_sentences are lists of translated sentences, such that trg_sentences[i] is the translation of src_sentences[i]
dev_trans_acc = evaluation.TranslationEvaluator(
src_sentences,
trg_sentences,
name=os.path.basename(dev_file),
batch_size=inference_batch_size,
)
```
### Multi-Lingual Semantic Textual Similarity
You can also measure the semantic textual similarity (STS) between sentence pairs in different languages:
```python
sts_evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
```
Where `sentences1` and `sentences2` are lists of sentences and `scores` is a list of numeric values indicating the semantic similarity between `sentences1[i]` and `sentences2[i]`.
## Citation
If you use the code for multilingual models, feel free to cite our publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813):
```
@article{reimers-2020-multilingual-sentence-bert,
title = "Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation",
author = "Reimers, Nils and Gurevych, Iryna",
journal= "arXiv preprint arXiv:2004.09813",
month = "04",
year = "2020",
url = "http://arxiv.org/abs/2004.09813",
}
```
"""
OPUS (http://opus.nlpl.eu/) is a great collection of different parallel datasets for more than 400 languages.
On the website, you can download parallel datasets for many languages in different formats. I found that
the format "Bottom-left triangle: download plain text files (MOSES/GIZA++)" requires minimal
overhead for post-processing to get it into a suitable format for this library.
You can use the OPUS dataset to create multilingual sentence embeddings. This script contains code to download
OPUS datasets for the desired languages and to create training files in the right format.
1) First, you need to install OpusTools (https://github.com/Helsinki-NLP/OpusTools/tree/master/opustools_pkg):
pip install opustools
2) Once you have OpusTools installed, you can download data in the right format via:
mkdir parallel-sentences
opus_read -d [CORPUS] -s [SRC_LANG] -t [TRG_LANG] --write parallel-sentences/[FILENAME].tsv.gz -wm moses -dl opus -p raw
For example:
mkdir parallel-sentences
opus_read -d JW300 -s en -t de --write parallel-sentences/JW300-en-de.tsv.gz -wm moses -dl opus -p raw
This downloads the JW300 Corpus (http://opus.nlpl.eu/JW300.php) for English (en) and German (de) and writes the output to
parallel-sentences/JW300-en-de.tsv.gz
####################
This Python code automates the download and creation of the parallel sentences files.
"""
from opustools import OpusRead
import os
corpora = ["JW300"] # Corpora you want to use
source_languages = ["en"] # Source language, our teacher model is able to understand
target_languages = ["de", "es", "it", "fr", "ar", "tr"] # Target languages, out student model should learn
output_folder = "parallel-sentences"
opus_download_folder = "./opus"
# Iterate over all corpora / source language / target language combinations and download the files
os.makedirs(output_folder, exist_ok=True)
for corpus in corpora:
for src_lang in source_languages:
for trg_lang in target_languages:
output_filename = os.path.join(output_folder, "{}-{}-{}.tsv.gz".format(corpus, src_lang, trg_lang))
if not os.path.exists(output_filename):
print("Create:", output_filename)
try:
read = OpusRead(
directory=corpus,
source=src_lang,
target=trg_lang,
write=[output_filename],
download_dir=opus_download_folder,
preprocess="raw",
write_mode="moses",
suppress_prompts=True,
)
read.printPairs()
except Exception:
print("An error occurred during the creation of", output_filename)
"""
This script downloads the parallel sentences corpus and creates parallel sentences tsv files that can be used to extend
existing sentence embedding models to new languages.
The parallel sentences corpus is a crawl of transcripts from talks, which are translated into 100+ languages.
The parallel sentences corpus cannot be downloaded automatically. It is available for research purposes only (CC-BY-NC).
The training procedure can be found in the files make_multilingual.py and make_multilingual_sys.py.
Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""
import os
import sentence_transformers.util
import gzip
import csv
from tqdm.autonotebook import tqdm
source_languages = set(["en"]) # Languages our (monolingual) teacher model understands
target_languages = set(["de", "es", "it", "fr", "ar", "tr"]) # New languages we want to extend to
dev_sentences = 1000 # Number of sentences we want to use for development
download_url = "https://sbert.net/datasets/parallel-sentences.tsv.gz" # Specify parallel sentences URL here
parallel_sentences_path = "../datasets/parallel-sentences.tsv.gz" # Path of the parallel-sentences.tsv.gz file.
parallel_sentences_folder = "parallel-sentences/"
os.makedirs(os.path.dirname(parallel_sentences_path), exist_ok=True)
if not os.path.exists(parallel_sentences_path):
print("parallel-sentences.tsv.gz does not exists. Try to download from server")
sentence_transformers.util.http_get(download_url, parallel_sentences_path)
os.makedirs(parallel_sentences_folder, exist_ok=True)
train_files = []
dev_files = []
files_to_create = []
for source_lang in source_languages:
for target_lang in target_languages:
output_filename_train = os.path.join(
parallel_sentences_folder, "talks-{}-{}-train.tsv.gz".format(source_lang, target_lang)
)
output_filename_dev = os.path.join(
parallel_sentences_folder, "talks-{}-{}-dev.tsv.gz".format(source_lang, target_lang)
)
train_files.append(output_filename_train)
dev_files.append(output_filename_dev)
if not os.path.exists(output_filename_train) or not os.path.exists(output_filename_dev):
files_to_create.append(
{
"src_lang": source_lang,
"trg_lang": target_lang,
"fTrain": gzip.open(output_filename_train, "wt", encoding="utf8"),
"fDev": gzip.open(output_filename_dev, "wt", encoding="utf8"),
"devCount": 0,
}
)
if len(files_to_create) > 0:
print(
"Parallel sentences files {} do not exist. Create these files now".format(
", ".join(map(lambda x: x["src_lang"] + "-" + x["trg_lang"], files_to_create))
)
)
with gzip.open(parallel_sentences_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for line in tqdm(reader, desc="Sentences"):
for outfile in files_to_create:
src_text = line[outfile["src_lang"]].strip()
trg_text = line[outfile["trg_lang"]].strip()
if src_text != "" and trg_text != "":
if outfile["devCount"] < dev_sentences:
outfile["devCount"] += 1
fOut = outfile["fDev"]
else:
fOut = outfile["fTrain"]
fOut.write("{}\t{}\n".format(src_text, trg_text))
for outfile in files_to_create:
outfile["fTrain"].close()
outfile["fDev"].close()
print("---DONE---")
"""
Tatoeba (https://tatoeba.org/) is a collection of sentences and translations, mainly aimed at language learning.
It is available for more than 300 languages.
This script downloads the Tatoeba corpus and extracts the sentences & translations in the languages you like
"""
import os
import sentence_transformers
import tarfile
import gzip
# Note: Tatoeba uses 3-letter language codes (ISO-639-2),
# while other datasets like OPUS use 2 letter language codes (ISO-639-1)
# For training of sentence transformers, which type of language code is used doesn't matter.
# For language codes, see: https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes
source_languages = set(["eng"])
target_languages = set(["deu", "ara", "tur", "spa", "ita", "fra"])
num_dev_sentences = 1000 # Number of sentences that are used to create a development set
tatoeba_folder = "../datasets/tatoeba"
output_folder = "parallel-sentences/"
sentences_file_bz2 = os.path.join(tatoeba_folder, "sentences.tar.bz2")
sentences_file = os.path.join(tatoeba_folder, "sentences.csv")
links_file_bz2 = os.path.join(tatoeba_folder, "links.tar.bz2")
links_file = os.path.join(tatoeba_folder, "links.csv")
download_url = "https://downloads.tatoeba.org/exports/"
os.makedirs(tatoeba_folder, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)
# Download files if needed
for filepath in [sentences_file_bz2, links_file_bz2]:
if not os.path.exists(filepath):
url = download_url + os.path.basename(filepath)
print("Download", url)
sentence_transformers.util.http_get(url, filepath)
# Extract files if needed
if not os.path.exists(sentences_file):
print("Extract", sentences_file_bz2)
tar = tarfile.open(sentences_file_bz2, "r:bz2")
tar.extract("sentences.csv", path=tatoeba_folder)
tar.close()
if not os.path.exists(links_file):
print("Extract", links_file_bz2)
tar = tarfile.open(links_file_bz2, "r:bz2")
tar.extract("links.csv", path=tatoeba_folder)
tar.close()
# Read sentences
sentences = {}
all_langs = target_languages.union(source_languages)
print("Read sentences.csv file")
with open(sentences_file, encoding="utf8") as fIn:
for line in fIn:
id, lang, sentence = line.strip().split("\t")
if lang in all_langs:
sentences[id] = (lang, sentence)
# Read links that map the translations between different languages
print("Read links.csv")
translations = {src_lang: {trg_lang: {} for trg_lang in target_languages} for src_lang in source_languages}
with open(links_file, encoding="utf8") as fIn:
for line in fIn:
src_id, target_id = line.strip().split()
if src_id in sentences and target_id in sentences:
src_lang, src_sent = sentences[src_id]
trg_lang, trg_sent = sentences[target_id]
if src_lang in source_languages and trg_lang in target_languages:
if src_sent not in translations[src_lang][trg_lang]:
translations[src_lang][trg_lang][src_sent] = []
translations[src_lang][trg_lang][src_sent].append(trg_sent)
# Write everything to the output folder
print("Write output files")
for src_lang in source_languages:
for trg_lang in target_languages:
source_sentences = list(translations[src_lang][trg_lang])
train_sentences = source_sentences[num_dev_sentences:]
dev_sentences = source_sentences[0:num_dev_sentences]
print("{}-{} has {} sentences".format(src_lang, trg_lang, len(source_sentences)))
if len(dev_sentences) > 0:
with gzip.open(
os.path.join(output_folder, "Tatoeba-{}-{}-dev.tsv.gz".format(src_lang, trg_lang)),
"wt",
encoding="utf8",
) as fOut:
for sent in dev_sentences:
fOut.write("\t".join([sent] + translations[src_lang][trg_lang][sent]))
fOut.write("\n")
if len(train_sentences) > 0:
with gzip.open(
os.path.join(output_folder, "Tatoeba-{}-{}-train.tsv.gz".format(src_lang, trg_lang)),
"wt",
encoding="utf8",
) as fOut:
for sent in train_sentences:
fOut.write("\t".join([sent] + translations[src_lang][trg_lang][sent]))
fOut.write("\n")
print("---DONE---")
"""
This script downloads the WikiMatrix corpus (https://github.com/facebookresearch/LASER/tree/master/tasks/WikiMatrix)
and creates parallel sentences tsv files that can be used to extend existing sentence embedding models to new languages.
WikiMatrix contains parallel sentences mined from Wikipedia in various languages.
Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""
import os
import sentence_transformers.util
import gzip
source_languages = set(["en"]) # Languages our (monolingual) teacher model understands
target_languages = set(["de", "es", "it", "fr", "ar", "tr"]) # New languages we want to extend to
num_dev_sentences = 1000 # Number of sentences we want to use for development
threshold = 1.075 # Only use sentences with a LASER similarity score above the threshold
download_url = "https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/"
download_folder = "../datasets/WikiMatrix/"
parallel_sentences_folder = "parallel-sentences/"
os.makedirs(os.path.dirname(download_folder), exist_ok=True)
os.makedirs(parallel_sentences_folder, exist_ok=True)
for source_lang in source_languages:
for target_lang in target_languages:
filename_train = os.path.join(
parallel_sentences_folder, "WikiMatrix-{}-{}-train.tsv.gz".format(source_lang, target_lang)
)
filename_dev = os.path.join(
parallel_sentences_folder, "WikiMatrix-{}-{}-dev.tsv.gz".format(source_lang, target_lang)
)
if not os.path.exists(filename_train) and not os.path.exists(filename_dev):
langs_ordered = sorted([source_lang, target_lang])
wikimatrix_filename = "WikiMatrix.{}-{}.tsv.gz".format(*langs_ordered)
wikimatrix_filepath = os.path.join(download_folder, wikimatrix_filename)
if not os.path.exists(wikimatrix_filepath):
print("Download", download_url + wikimatrix_filename)
try:
sentence_transformers.util.http_get(download_url + wikimatrix_filename, wikimatrix_filepath)
except Exception:
print("Was not able to download", download_url + wikimatrix_filename)
continue
if not os.path.exists(wikimatrix_filepath):
continue
train_sentences = []
dev_sentences = []
dev_sentences_set = set()
extract_dev_sentences = True
with gzip.open(wikimatrix_filepath, "rt", encoding="utf8") as fIn:
for line in fIn:
score, sent1, sent2 = line.strip().split("\t")
sent1 = sent1.strip()
sent2 = sent2.strip()
score = float(score)
if score < threshold:
break
if sent1 == sent2:
continue
if langs_ordered.index(source_lang) == 1: # Swap, so that src lang is sent1
sent1, sent2 = sent2, sent1
# Avoid duplicates in development set
if sent1 in dev_sentences_set or sent2 in dev_sentences_set:
continue
if extract_dev_sentences:
dev_sentences.append([sent1, sent2])
dev_sentences_set.add(sent1)
dev_sentences_set.add(sent2)
if len(dev_sentences) >= num_dev_sentences:
extract_dev_sentences = False
else:
train_sentences.append([sent1, sent2])
print("Write", len(dev_sentences), "dev sentences", filename_dev)
with gzip.open(filename_dev, "wt", encoding="utf8") as fOut:
for sents in dev_sentences:
fOut.write("\t".join(sents))
fOut.write("\n")
print("Write", len(train_sentences), "train sentences", filename_train)
with gzip.open(filename_train, "wt", encoding="utf8") as fOut:
for sents in train_sentences:
fOut.write("\t".join(sents))
fOut.write("\n")
print("---DONE---")
"""
This script contains an example of how to extend an existing sentence embedding model to new languages.
The (monolingual) teacher model you would like to extend to new languages is specified in the teacher_model_name
variable. We train a multilingual student model (variable student_model_name) to imitate the teacher model
on multiple languages.
For training, you need parallel sentence data (machine translation training data). You need tab-separated files (.tsv)
where the first column contains a sentence in a language understood by the teacher model, e.g. English,
and the further columns contain the corresponding translations into the languages you want to extend to.
This script automatically downloads the parallel sentences corpus. This corpus contains transcripts from
talks translated into 100+ languages. For other parallel data, see the get_parallel_data_[].py scripts
Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""
from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
from datetime import datetime
import os
import logging
import sentence_transformers.util
import csv
import gzip
from tqdm.autonotebook import tqdm
import numpy as np
import zipfile
import io
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)
teacher_model_name = (
"paraphrase-distilroberta-base-v2" # Our monolingual teacher model, we want to convert to multiple languages
)
student_model_name = "xlm-roberta-base" # Multilingual base model we use to imitate the teacher model
max_seq_length = 128 # Student model max. lengths for inputs (number of word pieces)
train_batch_size = 64 # Batch size for training
inference_batch_size = 64 # Batch size at inference
max_sentences_per_language = 500000 # Maximum number of parallel sentences for training
train_max_sentence_length = 250 # Maximum length (characters) for parallel training sentences
num_epochs = 5 # Train for x epochs
num_warmup_steps = 10000  # Warmup steps
num_evaluation_steps = 1000 # Evaluate performance after every xxxx steps
dev_sentences = 1000 # Number of parallel sentences to be used for development
# Define the language codes you would like to extend the model to
source_languages = set(["en"]) # Our teacher model accepts English (en) sentences
target_languages = set(
["de", "es", "it", "fr", "ar", "tr"]
) # We want to extend the model to these new languages. For language codes, see the header of the train file
output_path = (
"output/make-multilingual-"
+ "-".join(sorted(list(source_languages)) + sorted(list(target_languages)))
+ "-"
+ datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# This function downloads a corpus if it does not exist
def download_corpora(filepaths):
if not isinstance(filepaths, list):
filepaths = [filepaths]
for filepath in filepaths:
if not os.path.exists(filepath):
print(filepath, "does not exists. Try to download from server")
filename = os.path.basename(filepath)
url = "https://sbert.net/datasets/" + filename
sentence_transformers.util.http_get(url, filepath)
# Here we define train and dev corpora
train_corpus = "datasets/parallel-sentences.tsv.gz"
sts_corpus = "datasets/stsbenchmark.zip"
parallel_sentences_folder = "parallel-sentences/"
# Check if the files exist. If not, they are downloaded
download_corpora([train_corpus, sts_corpus])
# Create parallel files for the selected language combinations
os.makedirs(parallel_sentences_folder, exist_ok=True)
train_files = []
dev_files = []
files_to_create = []
for source_lang in source_languages:
for target_lang in target_languages:
output_filename_train = os.path.join(
parallel_sentences_folder, "talks-{}-{}-train.tsv.gz".format(source_lang, target_lang)
)
output_filename_dev = os.path.join(
parallel_sentences_folder, "talks-{}-{}-dev.tsv.gz".format(source_lang, target_lang)
)
train_files.append(output_filename_train)
dev_files.append(output_filename_dev)
if not os.path.exists(output_filename_train) or not os.path.exists(output_filename_dev):
files_to_create.append(
{
"src_lang": source_lang,
"trg_lang": target_lang,
"fTrain": gzip.open(output_filename_train, "wt", encoding="utf8"),
"fDev": gzip.open(output_filename_dev, "wt", encoding="utf8"),
"devCount": 0,
}
)
if len(files_to_create) > 0:
print(
"Parallel sentences files {} do not exist. Create these files now".format(
", ".join(map(lambda x: x["src_lang"] + "-" + x["trg_lang"], files_to_create))
)
)
with gzip.open(train_corpus, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for line in tqdm(reader, desc="Sentences"):
for outfile in files_to_create:
src_text = line[outfile["src_lang"]].strip()
trg_text = line[outfile["trg_lang"]].strip()
if src_text != "" and trg_text != "":
if outfile["devCount"] < dev_sentences:
outfile["devCount"] += 1
fOut = outfile["fDev"]
else:
fOut = outfile["fTrain"]
fOut.write("{}\t{}\n".format(src_text, trg_text))
for outfile in files_to_create:
outfile["fTrain"].close()
outfile["fDev"].close()
######## Start the extension of the teacher model to multiple languages ########
logger.info("Load teacher model")
teacher_model = SentenceTransformer(teacher_model_name)
logger.info("Create student model from scratch")
word_embedding_model = models.Transformer(student_model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(
student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True
)
for train_file in train_files:
train_data.load_data(
train_file, max_sentences=max_sentences_per_language, max_sentence_length=train_max_sentence_length
)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)
#### Evaluate cross-lingual performance on different tasks #####
evaluators = []  # evaluators is a list of different evaluators that we call periodically
for dev_file in dev_files:
logger.info("Create evaluator for " + dev_file)
src_sentences = []
trg_sentences = []
with gzip.open(dev_file, "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
if splits[0] != "" and splits[1] != "":
src_sentences.append(splits[0])
trg_sentences.append(splits[1])
# Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
dev_mse = evaluation.MSEEvaluator(
src_sentences,
trg_sentences,
name=os.path.basename(dev_file),
teacher_model=teacher_model,
batch_size=inference_batch_size,
)
evaluators.append(dev_mse)
# TranslationEvaluator computes the embeddings for all parallel sentences. It then checks whether the embedding of source[i] is the closest to target[i] out of all available target sentences
dev_trans_acc = evaluation.TranslationEvaluator(
src_sentences, trg_sentences, name=os.path.basename(dev_file), batch_size=inference_batch_size
)
evaluators.append(dev_trans_acc)
##### Read cross-lingual Semantic Textual Similarity (STS) data ####
all_languages = list(set(list(source_languages) + list(target_languages)))
sts_data = {}
# Open the ZIP File of STS2017-extended.zip and check for which language combinations we have STS data
with zipfile.ZipFile(sts_corpus) as zip:
filelist = zip.namelist()
sts_files = []
for i in range(len(all_languages)):
for j in range(i, len(all_languages)):
lang1 = all_languages[i]
lang2 = all_languages[j]
filepath = "STS2017-extended/STS.{}-{}.txt".format(lang1, lang2)
if filepath not in filelist:
lang1, lang2 = lang2, lang1
filepath = "STS2017-extended/STS.{}-{}.txt".format(lang1, lang2)
if filepath in filelist:
filename = os.path.basename(filepath)
sts_data[filename] = {"sentences1": [], "sentences2": [], "scores": []}
fIn = zip.open(filepath)
for line in io.TextIOWrapper(fIn, "utf8"):
sent1, sent2, score = line.strip().split("\t")
score = float(score)
sts_data[filename]["sentences1"].append(sent1)
sts_data[filename]["sentences2"].append(sent2)
sts_data[filename]["scores"].append(score)
for filename, data in sts_data.items():
test_evaluator = evaluation.EmbeddingSimilarityEvaluator(
data["sentences1"],
data["sentences2"],
data["scores"],
batch_size=inference_batch_size,
name=filename,
show_progress_bar=False,
)
evaluators.append(test_evaluator)
# Train the model
student_model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)),
epochs=num_epochs,
warmup_steps=num_warmup_steps,
evaluation_steps=num_evaluation_steps,
output_path=output_path,
save_best_model=True,
optimizer_params={"lr": 2e-5, "eps": 1e-6},
)
"""
This script contains an example of how to extend an existing sentence embedding model to new languages.
The (monolingual) teacher model you would like to extend to new languages is specified in the teacher_model_name
variable. We train a multilingual student model (variable student_model_name) to imitate the teacher model
on multiple languages.
For training, you need parallel sentence data (machine translation training data). You need tab-separated files (.tsv)
where the first column contains a sentence in a language understood by the teacher model, e.g. English,
and the further columns contain the corresponding translations into the languages you want to extend to.
See get_parallel_data_[opus/tatoeba/talks].py for automatic download of parallel sentences datasets.
Note: See make_multilingual.py for a fully automated script that downloads the necessary data and trains the model. This script just trains the model if you have already parallel data in the right format.
Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
Usage:
python make_multilingual_sys.py train1.tsv.gz train2.tsv.gz train3.tsv.gz --dev dev1.tsv.gz dev2.tsv.gz
For example:
python make_multilingual_sys.py parallel-sentences/talks-en-de-train.tsv.gz --dev parallel-sentences/talks-en-de-dev.tsv.gz
To load all training & dev files from a folder (Linux):
python make_multilingual_sys.py parallel-sentences/*-train.tsv.gz --dev parallel-sentences/*-dev.tsv.gz
"""
from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
from datetime import datetime
import os
import logging
import gzip
import numpy as np
import sys
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)
teacher_model_name = (
"paraphrase-distilroberta-base-v2" # Our monolingual teacher model, we want to convert to multiple languages
)
student_model_name = "xlm-roberta-base" # Multilingual base model we use to imitate the teacher model
max_seq_length = 128 # Student model max. lengths for inputs (number of word pieces)
train_batch_size = 64 # Batch size for training
inference_batch_size = 64 # Batch size at inference
max_sentences_per_trainfile = 500000 # Maximum number of parallel sentences for training
train_max_sentence_length = 250 # Maximum length (characters) for parallel training sentences
num_epochs = 5 # Train for x epochs
num_warmup_steps = 10000  # Warmup steps
num_evaluation_steps = 1000 # Evaluate performance after every xxxx steps
output_path = "output/make-multilingual-sys-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Read passed arguments
train_files = []
dev_files = []
is_dev_file = False
for arg in sys.argv[1:]:
if arg.lower() == "--dev":
is_dev_file = True
else:
if not os.path.exists(arg):
print("File could not be found:", arg)
exit()
if is_dev_file:
dev_files.append(arg)
else:
train_files.append(arg)
if len(train_files) == 0:
print("Please pass at least some train files")
print("python make_multilingual_sys.py file1.tsv.gz file2.tsv.gz --dev dev1.tsv.gz dev2.tsv.gz")
exit()
logger.info("Train files: {}".format(", ".join(train_files)))
logger.info("Dev files: {}".format(", ".join(dev_files)))
######## Start the extension of the teacher model to multiple languages ########
logger.info("Load teacher model")
teacher_model = SentenceTransformer(teacher_model_name)
logger.info("Create student model from scratch")
word_embedding_model = models.Transformer(student_model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(
student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True
)
for train_file in train_files:
train_data.load_data(
train_file, max_sentences=max_sentences_per_trainfile, max_sentence_length=train_max_sentence_length
)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)
#### Evaluate cross-lingual performance on different tasks #####
evaluators = []  # evaluators is a list of different evaluators that we call periodically
for dev_file in dev_files:
logger.info("Create evaluator for " + dev_file)
src_sentences = []
trg_sentences = []
with gzip.open(dev_file, "rt", encoding="utf8") if dev_file.endswith(".gz") else open(
dev_file, encoding="utf8"
) as fIn:
for line in fIn:
splits = line.strip().split("\t")
if splits[0] != "" and splits[1] != "":
src_sentences.append(splits[0])
trg_sentences.append(splits[1])
# Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
dev_mse = evaluation.MSEEvaluator(
src_sentences,
trg_sentences,
name=os.path.basename(dev_file),
teacher_model=teacher_model,
batch_size=inference_batch_size,
)
evaluators.append(dev_mse)
# TranslationEvaluator computes the embeddings for all parallel sentences. It then checks whether the embedding of source[i] is the closest to target[i] out of all available target sentences
dev_trans_acc = evaluation.TranslationEvaluator(
src_sentences, trg_sentences, name=os.path.basename(dev_file), batch_size=inference_batch_size
)
evaluators.append(dev_trans_acc)
# Train the model
student_model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)),
epochs=num_epochs,
warmup_steps=num_warmup_steps,
evaluation_steps=num_evaluation_steps,
output_path=output_path,
save_best_model=True,
optimizer_params={"lr": 2e-5, "eps": 1e-6, "correct_bias": False},
)
# Natural Language Inference
Given two sentences (premise and hypothesis), Natural Language Inference (NLI) is the task of deciding whether the premise entails the hypothesis, whether they contradict each other, or whether they are neutral. Commonly used NLI datasets are [SNLI](https://arxiv.org/abs/1508.05326) and [MultiNLI](https://arxiv.org/abs/1704.05426).
[Conneau et al.](https://arxiv.org/abs/1705.02364) showed that NLI data can be quite useful when training Sentence Embedding methods. We also found this in our [Sentence-BERT-Paper](https://arxiv.org/abs/1908.10084) and often use NLI as a first fine-tuning step for sentence embedding methods.
To train on NLI, see the following example files:
- **[training_nli.py](training_nli.py)** - This example uses the Softmax-Classification-Loss, as described in the [SBERT-Paper](https://arxiv.org/abs/1908.10084), to learn sentence embeddings.
- **[training_nli_v2.py](training_nli_v2.py)** - The Softmax-Classification-Loss, as used in our original SBERT paper, does not yield optimal performance. A better loss is [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss), where we provide pairs or triplets. In that example, we provide a triplet of the format: (anchor, entailment_sentence, contradiction_sentence). The NLI data provides such triplets. The MultipleNegativesRankingLoss yields much higher performance and is more intuitive than the Softmax-Classification-Loss. We have used this loss to train the paraphrase model in our [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813) paper.
- **[training_nli_v3.py](training_nli_v3.py)** - Following the [GISTEmbed](https://arxiv.org/abs/2402.16829) paper, we can modify the in-batch negative selection from [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss) using a guiding model. Candidate negative pairs are ignored during training if the guiding model considers the pair to be too similar. In practice, the [GISTEmbedLoss](https://www.sbert.net/docs/package_reference/losses.html#gistembedloss) tends to produce a stronger training signal than `MultipleNegativesRankingLoss` at the cost of some training overhead for running inference on the guiding model; a minimal construction sketch follows this list.
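A minimal sketch of constructing this loss (the guide model name is only an example):

```python
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("distilroberta-base")  # model being trained
guide = SentenceTransformer("all-MiniLM-L6-v2")  # example guiding model
train_loss = losses.GISTEmbedLoss(model, guide)
# train_loss is then used like MultipleNegativesRankingLoss, with (anchor, positive[, negative]) InputExamples
```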
## Data
In our experiments we combine [SNLI](https://arxiv.org/abs/1508.05326) and [MultiNLI](https://arxiv.org/abs/1704.05426), which we call AllNLI. These two datasets contain sentence pairs and one of three labels: entailment, neutral, contradiction:
| Sentence A (Premise) | Sentence B (Hypothesis) | Label |
| --- | --- | --- |
| A soccer game with multiple males playing. | Some men are playing a sport. | entailment |
| An older and younger man smiling. | Two men are smiling and laughing at the cats playing on the floor. | neutral |
| A man inspects the uniform of a figure in some East Asian country. | The man is sleeping. | contradiction |
## SoftmaxLoss
[Conneau et al.](https://arxiv.org/abs/1705.02364) described how a softmax classifier on top of a siamese network can be used to learn meaningful sentence representations. We can achieve this by using the [losses.SoftmaxLoss](../../../docs/package_reference/losses.html#softmaxloss) class.
The softmax loss looks like this:
![SBERT SoftmaxLoss](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/SBERT_SoftmaxLoss.png "SBERT SoftmaxLoss")
We pass the two sentences through our SentenceTransformer network and get the sentence embeddings *u* and *v*. We then concatenate u, v and |u-v| to form one long vector. This vector is then passed to a softmax classifier, which predicts our three classes (entailment, neutral, contradiction).
This setup learns sentence embeddings that can later be used for a wide variety of tasks.
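A minimal sketch of this setup, using two labeled pairs from the table above (model name and hyperparameters are only examples; see [training_nli.py](training_nli.py) for the full script):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("distilroberta-base")  # transformer + mean pooling
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}

train_examples = [
    InputExample(texts=["A soccer game with multiple males playing.", "Some men are playing a sport."],
                 label=label2int["entailment"]),
    InputExample(texts=["A man inspects the uniform of a figure in some East Asian country.", "The man is sleeping."],
                 label=label2int["contradiction"]),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)

# The classifier head on top of (u, v, |u-v|) is created inside SoftmaxLoss
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=len(label2int),
)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)
```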
## MultipleNegativesRankingLoss
That the softmax-loss with NLI data produces (relatively) good sentence embeddings is rather coincidental. The [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss) is much more intuitive and also produces significantly better sentence representations.
The training data for MultipleNegativesRankingLoss consists of sentence pairs [(a<sub>1</sub>, b<sub>1</sub>), ..., (a<sub>n</sub>, b<sub>n</sub>)] where we assume that (a<sub>i</sub>, b<sub>i</sub>) are similar sentences and (a<sub>i</sub>, b<sub>j</sub>) are dissimilar sentences for i != j. The loss minimizes the distance between (a<sub>i</sub>, b<sub>i</sub>) while simultaneously maximizing the distance between (a<sub>i</sub>, b<sub>j</sub>) for all i != j.
For example in the following picture:
![](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/MultipleNegativeRankingLoss.png)
The distance between (a<sub>1</sub>, b<sub>1</sub>) is reduced, while the distance between (a<sub>1</sub>, b<sub>2...5</sub>) will be increased. The same is done for a<sub>2</sub>, ..., a<sub>5</sub>.
Using MultipleNegativesRankingLoss with NLI is rather easy: we define sentences that have an *entailment* label as positive pairs. E.g., we have pairs like (*"A soccer game with multiple males playing."*, *"Some men are playing a sport."*) and want these pairs to be close in the vector space.
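A minimal sketch with entailment pairs (the model name is only an example); within a batch, every other sentence serves as an in-batch negative:

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("distilroberta-base")
train_examples = [
    InputExample(texts=["A soccer game with multiple males playing.", "Some men are playing a sport."]),
    # ... more (premise, entailed hypothesis) pairs
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
```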
### MultipleNegativesRankingLoss with Hard Negatives
We can further improve MultipleNegativesRankingLoss by not only providing pairs, but by providing triplets: [(a<sub>1</sub>, b<sub>1</sub>, c<sub>1</sub>), ..., (a<sub>n</sub>, b<sub>n</sub>, c<sub>n</sub>)]
The entries c<sub>i</sub> are so-called hard negatives: on a lexical level, they are similar to a<sub>i</sub> and b<sub>i</sub>, but on a semantic level they mean different things and should not be close to a<sub>i</sub> in the vector space.
For NLI data, we can use the contradiction-label to create such triplets with a hard negative. So our triplets look like this:
("*A soccer game with multiple males playing."*, *"Some men are playing a sport."*, *"A group of men playing a baseball game."*).
We want the sentences *"A soccer game with multiple males playing."* and *"Some men are playing a sport."* to be close in the vector space, while there should be a larger distance between *"A soccer game with multiple males playing."* and "*A group of men playing a baseball game."*.
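Compared to the pair sketch above, the only change is that each `InputExample` carries a third text, which MultipleNegativesRankingLoss treats as an additional hard negative:

```python
train_examples = [
    InputExample(texts=[
        "A soccer game with multiple males playing.",  # anchor (premise)
        "Some men are playing a sport.",  # positive (entailment)
        "A group of men playing a baseball game.",  # hard negative (contradiction)
    ]),
    # ... more (anchor, entailment, contradiction) triplets
]
train_loss = losses.MultipleNegativesRankingLoss(model)
```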