"""
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT, etc.) on the SNLI + MultiNLI (AllNLI) dataset
with a softmax loss function. Every 1000 training steps, the model is evaluated on the
STS benchmark dataset.
Usage:
python training_nli.py
OR
python training_nli.py pretrained_transformer_model_name
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"
sts_dataset_path = "data/stsbenchmark.tsv.gz"
if not os.path.exists(nli_dataset_path):
util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else "bert-base-uncased"
# Read the dataset
train_batch_size = 16
model_save_path = (
"output/training_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Read the AllNLI.tsv.gz file and create the training dataset
logging.info("Read AllNLI train dataset")
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_samples = []
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "train":
label_id = label2int[row["label"]]
train_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=label_id))
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.SoftmaxLoss(
model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int)
)
# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "dev":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
dev_samples, batch_size=train_batch_size, name="sts-dev"
)
# Configure the training
num_epochs = 1
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "test":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
test_samples, batch_size=train_batch_size, name="sts-test"
)
test_evaluator(model, output_path=model_save_path)
"""
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with MultipleNegativesRankingLoss. Entailment pairs are used as positive pairs, and the contradictions from the AllNLI dataset are added as hard negatives.
After every 10% of the training steps, the model is evaluated on the STS benchmark dataset.
Usage:
python training_nli_v2.py
OR
python training_nli_v2.py pretrained_transformer_model_name
"""
import math
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilroberta-base"
train_batch_size = 128 # The larger you select this, the better the results (usually). But it requires more GPU memory
max_seq_length = 75
num_epochs = 1
# Save path of the model
model_save_path = (
"output/training_nli_v2_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"
sts_dataset_path = "data/stsbenchmark.tsv.gz"
if not os.path.exists(nli_dataset_path):
util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# Read the AllNLI.tsv.gz file and create the training dataset
logging.info("Read AllNLI train dataset")
def add_to_samples(sent1, sent2, label):
if sent1 not in train_data:
train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
train_data[sent1][label].add(sent2)
train_data = {}
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "train":
sent1 = row["sentence1"].strip()
sent2 = row["sentence2"].strip()
add_to_samples(sent1, sent2, row["label"])
add_to_samples(sent2, sent1, row["label"]) # Also add the opposite
train_samples = []
for sent1, others in train_data.items():
if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
train_samples.append(
InputExample(
texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
)
)
train_samples.append(
InputExample(
texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
)
)
logging.info("Train samples: {}".format(len(train_samples)))
# Special data loader that avoids duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "dev":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
dev_samples, batch_size=train_batch_size, name="sts-dev"
)
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=int(len(train_dataloader) * 0.1),
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=False, # Set to True, if your GPU supports FP16 operations
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "test":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
test_samples, batch_size=train_batch_size, name="sts-test"
)
test_evaluator(model, output_path=model_save_path)
"""
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with GISTEmbedLoss, using all-MiniLM-L6-v2 as an efficient guiding model. Entailment pairs are used as positive pairs, and the
contradictions from the AllNLI dataset are added as hard negatives. After every 10% of the training steps, the model is evaluated on the STS benchmark dataset.
Usage:
python training_nli_v3.py
OR
python training_nli_v3.py pretrained_transformer_model_name
"""
import math
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilroberta-base"
train_batch_size = 128 # The larger you select this, the better the results (usually). But it requires more GPU memory
max_seq_length = 75
num_epochs = 1
# Save path of the model
model_save_path = (
"output/training_nli_v3_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"
sts_dataset_path = "data/stsbenchmark.tsv.gz"
if not os.path.exists(nli_dataset_path):
util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# Read the AllNLI.tsv.gz file and create the training dataset
logging.info("Read AllNLI train dataset")
def add_to_samples(sent1, sent2, label):
if sent1 not in train_data:
train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
train_data[sent1][label].add(sent2)
train_data = {}
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "train":
sent1 = row["sentence1"].strip()
sent2 = row["sentence2"].strip()
add_to_samples(sent1, sent2, row["label"])
add_to_samples(sent2, sent1, row["label"]) # Also add the opposite
train_samples = []
for sent1, others in train_data.items():
if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
train_samples.append(
InputExample(
texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
)
)
train_samples.append(
InputExample(
texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
)
)
logging.info("Train samples: {}".format(len(train_samples)))
# Special data loader that avoids duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# The guiding model
guide_model = SentenceTransformer("all-MiniLM-L6-v2")
# Our training loss
train_loss = losses.GISTEmbedLoss(model, guide_model)
# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "dev":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
dev_samples, batch_size=train_batch_size, name="sts-dev"
)
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=int(len(train_dataloader) * 0.1),
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=False, # Set to True, if your GPU supports FP16 operations
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "test":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
test_samples, batch_size=train_batch_size, name="sts-test"
)
test_evaluator(model, output_path=model_save_path)
"""
This script trains sentence transformers with a batch hard loss function.
The TREC dataset will be automatically downloaded and put in the datasets/ directory
Usual triplet loss takes 3 inputs: anchor, positive, negative and optimizes the network such that
the positive sentence is closer to the anchor than the negative sentence. However, a challenge here is
to select good triplets. If the negative sentence is selected randomly, the training objective is often
too easy and the network fails to learn good representations.
Batch hard triplet loss (https://arxiv.org/abs/1703.07737) creates triplets on the fly. It requires that the
data is labeled (e.g. labels 1, 2, 3) and we assume that samples with the same label are similar:
Within a batch, it checks for each sentence (e.g., sent1 with label 1) which other sentence with the same label is the furthest away (hard positive)
and which sentence with a different label is the closest (hard negative). It then optimizes these cases, i.e.,
all sentences with the same label should be close together and sentences with different labels should be clearly separated.
"""
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util
from sentence_transformers.datasets import SentenceLabelDataset
from torch.utils.data import DataLoader
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
import logging
import os
import random
from collections import defaultdict
logging.basicConfig(
format="%(asctime)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO,
handlers=[LoggingHandler()],
)
# Inspired from torchnlp
def trec_dataset(
directory="datasets/trec/",
train_filename="train_5500.label",
test_filename="TREC_10.label",
validation_dataset_nb=500,
urls=[
"https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label",
"https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label",
],
):
os.makedirs(directory, exist_ok=True)
ret = []
for url, filename in zip(urls, [train_filename, test_filename]):
full_path = os.path.join(directory, filename)
if not os.path.exists(full_path):
util.http_get(url, full_path)
examples = []
label_map = {}
guid = 1
for line in open(full_path, "rb"):
# there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
label, _, text = line.replace(b"\xf0", b" ").strip().decode().partition(" ")
if label not in label_map:
label_map[label] = len(label_map)
label_id = label_map[label]
guid += 1
examples.append(InputExample(guid=guid, texts=[text], label=label_id))
ret.append(examples)
train_set, test_set = ret
dev_set = None
# Create a dev set from train set
if validation_dataset_nb > 0:
dev_set = train_set[-validation_dataset_nb:]
train_set = train_set[:-validation_dataset_nb]
# For dev & test set, we return triplets (anchor, positive, negative)
random.seed(42) # Fix seed, so that we always get the same triplets
dev_triplets = triplets_from_labeled_dataset(dev_set)
test_triplets = triplets_from_labeled_dataset(test_set)
return train_set, dev_triplets, test_triplets
def triplets_from_labeled_dataset(input_examples):
# Create triplets for a [(label, sentence), (label, sentence)...] dataset
# by using each example as an anchor and selecting randomly a
# positive instance with the same label and a negative instance with a different label
triplets = []
label2sentence = defaultdict(list)
for inp_example in input_examples:
label2sentence[inp_example.label].append(inp_example)
for inp_example in input_examples:
anchor = inp_example
if len(label2sentence[inp_example.label]) < 2: # We need at least 2 examples per label to create a triplet
continue
positive = None
while positive is None or positive.guid == anchor.guid:
positive = random.choice(label2sentence[inp_example.label])
negative = None
while negative is None or negative.label == anchor.label:
negative = random.choice(input_examples)
triplets.append(InputExample(texts=[anchor.texts[0], positive.texts[0], negative.texts[0]]))
return triplets
# You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = "all-distilroberta-v1"
### Create a torch.DataLoader that passes training batch instances to our model
train_batch_size = 32
output_path = "output/finetune-batch-hard-trec-" + model_name + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
num_epochs = 1
logging.info("Loading TREC dataset")
train_set, dev_set, test_set = trec_dataset()
# We create a special dataset "SentenceLabelDataset" to wrap our train_set
# It will yield batches that contain at least two samples with the same label
train_data_sampler = SentenceLabelDataset(train_set)
train_dataloader = DataLoader(train_data_sampler, batch_size=32, drop_last=True)
# Load pretrained model
logging.info("Load model")
model = SentenceTransformer(model_name)
### Triplet losses ####################
### There are 4 triplet loss variants:
### - BatchHardTripletLoss
### - BatchHardSoftMarginTripletLoss
### - BatchSemiHardTripletLoss
### - BatchAllTripletLoss
#######################################
train_loss = losses.BatchAllTripletLoss(model=model)
# train_loss = losses.BatchHardTripletLoss(model=model)
# train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)
# train_loss = losses.BatchSemiHardTripletLoss(model=model)
logging.info("Read TREC val dataset")
dev_evaluator = TripletEvaluator.from_input_examples(dev_set, name="trec-dev")
logging.info("Performance before fine-tuning:")
dev_evaluator(model)
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) # 10% of train data
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=output_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on TREC dataset
#
##############################################################################
logging.info("Evaluating model on test set")
test_evaluator = TripletEvaluator.from_input_examples(test_set, name="trec-test")
model.evaluate(test_evaluator)
"""
This is an example of how to train SentenceTransformers in a multi-task setup.
The system trains BERT on the AllNLI and on the STSbenchmark dataset.
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import gzip
import csv
import os
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Read the dataset
model_name = "bert-base-uncased"
batch_size = 16
model_save_path = "output/training_multi-task_" + model_name + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "datasets/AllNLI.tsv.gz"
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(nli_dataset_path):
util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# Use BERT for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_nli_samples = []
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "train":
label_id = label2int[row["label"]]
train_nli_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=label_id))
train_dataloader_nli = DataLoader(train_nli_samples, shuffle=True, batch_size=batch_size)
train_loss_nli = losses.SoftmaxLoss(
model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int)
)
logging.info("Read STSbenchmark train dataset")
train_sts_samples = []
dev_sts_samples = []
test_sts_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_sts_samples.append(inp_example)
elif row["split"] == "test":
test_sts_samples.append(inp_example)
else:
train_sts_samples.append(inp_example)
train_dataloader_sts = DataLoader(train_sts_samples, shuffle=True, batch_size=batch_size)
train_loss_sts = losses.CosineSimilarityLoss(model=model)
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_sts_samples, name="sts-dev")
# Configure the training
num_epochs = 4
warmup_steps = math.ceil(len(train_dataloader_sts) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Here we define the two train objectives: train_dataloader_nli with train_loss_nli (i.e., SoftmaxLoss for NLI data)
# and train_dataloader_sts with train_loss_sts (i.e., CosineSimilarityLoss for STSbenchmark data)
# You can pass as many (dataloader, loss) tuples as you like. They are iterated in a round-robin way.
train_objectives = [(train_dataloader_nli, train_loss_nli), (train_dataloader_sts, train_loss_sts)]
# Train the model
model.fit(
train_objectives=train_objectives,
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_sts_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
"""
This script trains sentence transformers with a triplet loss function.
As a corpus, we use the Wikipedia sections dataset described by Dor et al., 2018, Learning Thematic Similarity Metric Using Triplet Networks.
"""
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
from zipfile import ZipFile
import csv
import logging
import os
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)
# You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = "distilbert-base-uncased"
dataset_path = "datasets/wikipedia-sections"
if not os.path.exists(dataset_path):
os.makedirs(dataset_path, exist_ok=True)
filepath = os.path.join(dataset_path, "wikipedia-sections-triplets.zip")
util.http_get("https://sbert.net/datasets/wikipedia-sections-triplets.zip", filepath)
with ZipFile(filepath, "r") as zip:
zip.extractall(dataset_path)
### Create a torch.DataLoader that passes training batch instances to our model
train_batch_size = 16
output_path = "output/training-wikipedia-sections-" + model_name + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
num_epochs = 1
### Configure sentence transformers for training and train on the provided dataset
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
logger.info("Read Triplet train dataset")
train_examples = []
with open(os.path.join(dataset_path, "train.csv"), encoding="utf-8") as fIn:
reader = csv.DictReader(fIn, delimiter=",", quoting=csv.QUOTE_MINIMAL)
for row in reader:
train_examples.append(InputExample(texts=[row["Sentence1"], row["Sentence2"], row["Sentence3"]], label=0))
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)
logger.info("Read Wikipedia Triplet dev dataset")
dev_examples = []
with open(os.path.join(dataset_path, "validation.csv"), encoding="utf-8") as fIn:
reader = csv.DictReader(fIn, delimiter=",", quoting=csv.QUOTE_MINIMAL)
for row in reader:
dev_examples.append(InputExample(texts=[row["Sentence1"], row["Sentence2"], row["Sentence3"]]))
if len(dev_examples) >= 1000:
break
evaluator = TripletEvaluator.from_input_examples(dev_examples, name="dev")
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) # 10% of train data
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=output_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
logger.info("Read test examples")
test_examples = []
with open(os.path.join(dataset_path, "test.csv"), encoding="utf-8") as fIn:
reader = csv.DictReader(fIn, delimiter=",", quoting=csv.QUOTE_MINIMAL)
for row in reader:
test_examples.append(InputExample(texts=[row["Sentence1"], row["Sentence2"], row["Sentence3"]]))
model = SentenceTransformer(output_path)
test_evaluator = TripletEvaluator.from_input_examples(test_examples, name="test")
test_evaluator(model, output_path=output_path)
import math
import logging
import random
class MultiDatasetDataLoader:
def __init__(self, datasets, batch_size_pairs, batch_size_triplets=None, dataset_size_temp=-1):
self.allow_swap = True
self.collate_fn = None  # Set by SentenceTransformer.fit() to its smart batching collate function; used in __iter__
self.batch_size_pairs = batch_size_pairs
self.batch_size_triplets = batch_size_pairs if batch_size_triplets is None else batch_size_triplets
# Compute dataset weights
self.dataset_lengths = list(map(len, datasets))
self.dataset_lengths_sum = sum(self.dataset_lengths)
weights = []
if dataset_size_temp > 0: # Scale probability with dataset size
for dataset in datasets:
prob = len(dataset) / self.dataset_lengths_sum
weights.append(max(1, int(math.pow(prob, 1 / dataset_size_temp) * 1000)))
else: # Equal weighting of all datasets
weights = [100] * len(datasets)
logging.info("Dataset lengths and weights: {}".format(list(zip(self.dataset_lengths, weights))))
self.dataset_idx = []
self.dataset_idx_pointer = 0
for idx, weight in enumerate(weights):
self.dataset_idx.extend([idx] * weight)
random.shuffle(self.dataset_idx)
self.datasets = []
for dataset in datasets:
random.shuffle(dataset)
self.datasets.append(
{
"elements": dataset,
"pointer": 0,
}
)
def __iter__(self):
for _ in range(int(self.__len__())):
# Select dataset
if self.dataset_idx_pointer >= len(self.dataset_idx):
self.dataset_idx_pointer = 0
random.shuffle(self.dataset_idx)
dataset_idx = self.dataset_idx[self.dataset_idx_pointer]
self.dataset_idx_pointer += 1
# Select batch from this dataset
dataset = self.datasets[dataset_idx]
batch_size = self.batch_size_pairs if len(dataset["elements"][0].texts) == 2 else self.batch_size_triplets
batch = []
texts_in_batch = set()
guid_in_batch = set()
while len(batch) < batch_size:
example = dataset["elements"][dataset["pointer"]]
valid_example = True
# First check if one of the texts is already in the batch
for text in example.texts:
text_norm = text.strip().lower()
if text_norm in texts_in_batch:
valid_example = False
texts_in_batch.add(text_norm)
# If the example has a guid, check if guid is in batch
if example.guid is not None:
valid_example = valid_example and example.guid not in guid_in_batch
guid_in_batch.add(example.guid)
if valid_example:
if self.allow_swap and random.random() > 0.5:
example.texts[0], example.texts[1] = example.texts[1], example.texts[0]
batch.append(example)
dataset["pointer"] += 1
if dataset["pointer"] >= len(dataset["elements"]):
dataset["pointer"] = 0
random.shuffle(dataset["elements"])
yield self.collate_fn(batch) if self.collate_fn is not None else batch
def __len__(self):
return int(self.dataset_lengths_sum / self.batch_size_pairs)
# Paraphrase Data
**This page is currently work-in-progress and will be extended in the future**
In our paper [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813) we showed that paraphrase data together with [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss) is a powerful combination for learning sentence embedding models.
See [NLI - MultipleNegativesRankingLoss](https://www.sbert.net/examples/training/nli/README.html#multiplenegativesrankingloss) for more information on how this loss can be used.
In this folder, we collect different datasets and scripts to train using paraphrase data.
## Datasets
A list of paraphrase datasets suitable for training is available at [sbert.net/datasets/paraphrases](http://sbert.net/datasets/paraphrases).
| Name | Source | #Sentence-Pairs | STSb-dev |
| --- | --- | :---: | :---: |
| [AllNLI.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/AllNLI.tsv.gz) | [SNLI](https://nlp.stanford.edu/projects/snli/) + [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | 277,230 | 86.54 |
| [sentence-compression.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/sentence-compression.tsv.gz) | [sentence-compression](https://github.com/google-research-datasets/sentence-compression) | 180,000 | 84.36 |
| [SimpleWiki.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/SimpleWiki.tsv.gz) | [SimpleWiki](https://cs.pomona.edu/~dkauchak/simplification/) | 102,225 | 84.26 |
| [altlex.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/altlex.tsv.gz) | [altlex](https://github.com/chridey/altlex/) | 112,696 | 83.34 |
| [msmarco-triplets.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/msmarco-triplets.tsv.gz) | [MS MARCO Passages](https://microsoft.github.io/msmarco/) | 5,028,051 | 83.12 |
| [quora_duplicates.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/quora_duplicates.tsv.gz) | [Quora](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | 103,663 | 82.55 |
| [coco_captions-with-guid.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/coco_captions-with-guid.tsv.gz) | [COCO](https://cocodataset.org/) | 828,395 | 82.25
| [flickr30k_captions-with-guid.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/flickr30k_captions-with-guid.tsv.gz) | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | 317,695 | 82.04
| [yahoo_answers_title_question.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/yahoo_answers_title_question.tsv.gz) | [Yahoo Answers Dataset](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) | 659,896 | 81.19 |
| [S2ORC_citation_pairs.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/S2ORC_citation_pairs.tsv.gz) | [Semantic Scholar Open Research Corpus](http://s2-public-api-prod.us-west-2.elasticbeanstalk.com/corpus/) | 52,603,982 | 81.02 |
| [yahoo_answers_title_answer.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/yahoo_answerstitle_answer.tsv.gz) | [Yahoo Answers Dataset](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) | 1,198,260 | 80.25
| [stackexchange_duplicate_questions.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/stackexchange_duplicate_questions.tsv.gz) | [Stackexchange](https://stackexchange.com/) | 169,438 | 80.37
| [yahoo_answers_question_answer.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/yahoo_answers_question_answer.tsv.gz) | [Yahoo Answers Dataset](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) | 681,164 | 79.88 |
| [wiki-atomic-edits.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/wiki-atomic-edits.tsv.gz) | [wiki-atomic-edits](https://github.com/google-research-datasets/wiki-atomic-edits) | 22,980,185 | 79.58
| [wiki-split.tsv.gz](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/paraphrases/wiki-split.tsv.gz) | [wiki-split](https://github.com/google-research-datasets/wiki-split) | 929,944 | 76.59
See the respective linked source website for the dataset license.
All datasets have one sample per line, with the individual sentences separated by a tab (\t). Some datasets (like AllNLI) have three sentences per line: an anchor, a positive, and a hard negative.
For each dataset, we measure the performance on the STSb development set after 2k training steps with a distilroberta-base model and a batch size of 256.
**Note**: We find that the STSb dataset is a suboptimal dataset to evaluate the quality of sentence embedding models. It consists mainly of rather simple sentences, it does not require any domain specific knowledge, and the included sentences are of rather high quality compared to noisy, user-written content. Please do not infer from the above numbers how the approaches will perform on your domain specific dataset.
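As an illustration of the file format described above, a file without a guid column could be read like this. This is a minimal sketch: the filename is just an example, and the `*-with-guid.tsv.gz` files carry an id in the first column, which the training script below handles separately.

```python
import gzip

from sentence_transformers import InputExample

train_samples = []
with gzip.open("AllNLI.tsv.gz", "rt", encoding="utf8") as fIn:
    for line in fIn:
        # 2 columns = paraphrase pair, 3 columns = anchor, positive, hard negative
        texts = line.strip().split("\t")
        train_samples.append(InputExample(texts=texts))
```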
## Training
See [training.py](training.py) for the training script.
The training script allows loading one or multiple files. We construct batches by sampling examples from the respective dataset. So far, examples are not mixed between the datasets, i.e., a batch consists only of examples from a single dataset.
As the datasets differ greatly in size, we perform temperature-controlled sampling: smaller datasets are up-sampled, while larger datasets are down-sampled. This allows effective training with a mix of very large and small datasets.
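The weighting used by the `MultiDatasetDataLoader` above can be sketched as follows. This is a minimal illustration; `temperature` corresponds to the `dataset_size_temp` parameter of the data loader.

```python
import math


def dataset_weights(dataset_lengths, temperature):
    """Up-sample small datasets and down-sample large ones via a temperature on the size distribution."""
    total = sum(dataset_lengths)
    return [max(1, int(math.pow(length / total, 1 / temperature) * 1000)) for length in dataset_lengths]


# Example: three datasets of very different sizes
print(dataset_weights([1_000, 100_000, 10_000_000], temperature=4))
```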
## Pre-Trained Models
Have a look at [pre-trained models](https://www.sbert.net/docs/pretrained_models.html) to view all models that were trained on these paraphrase datasets.
- **paraphrase-MiniLM-L12-v2** - Trained on the following datasets: AllNLI, sentence-compression, SimpleWiki, altlex, msmarco-triplets, quora_duplicates, coco_captions,flickr30k_captions, yahoo_answers_title_question, S2ORC_citation_pairs, stackexchange_duplicate_questions, wiki-atomic-edits
- **paraphrase-distilroberta-base-v2** - Trained on the following datasets: AllNLI, sentence-compression, SimpleWiki, altlex, msmarco-triplets, quora_duplicates, coco_captions,flickr30k_captions, yahoo_answers_title_question, S2ORC_citation_pairs, stackexchange_duplicate_questions, wiki-atomic-edits
- **paraphrase-distilroberta-base-v1** - Trained on the following datasets: AllNLI, sentence-compression, SimpleWiki, altlex, quora_duplicates, wiki-atomic-edits, wiki-split
- **paraphrase-xlm-r-multilingual-v1** - Multilingual version of paraphrase-distilroberta-base-v1, trained on parallel data for 50+ languages. (Teacher: paraphrase-distilroberta-base-v1, Student: xlm-r-base)
## Work in Progress
Training with this data is currently work in progress. Things that will be added in the future:
- **More datasets**: Are you aware of more suitable training datasets? Let me know: [info@nils-reimers.de](mailto:info@nils-reimers.de)
- **Optimized batching**: Currently batches are only drawn from one dataset. Future work might also include batches that are sampled across datasets
- **Optimized loss function**: Currently the same MultipleNegativesRankingLoss parameters are used for all datasets. Future work includes testing whether the datasets benefit from individual loss functions.
- **Pre-trained models**: Once all datasets are collected, we will train and release respective models.
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
from MultiDatasetDataLoader import MultiDatasetDataLoader
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
model_name = "distilroberta-base"
num_epochs = 1
sts_dataset_path = "data-eval/stsbenchmark.tsv.gz"
batch_size_pairs = 384
batch_size_triplets = 256
max_seq_length = 128
use_amp = True # Set to False, if you use a CPU or your GPU does not support FP16 operations
evaluation_steps = 500
warmup_steps = 500
#####
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# Save path of the model
model_save_path = (
"output/training_paraphrases_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
## SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
dataset_list = []
for filepath in sys.argv[1:]:
dataset = []
with_guid = "with-guid" in filepath # Some datasets have a guid in the first column
with gzip.open(filepath, "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
if with_guid:
guid = splits[0]
texts = splits[1:]
else:
guid = None
texts = splits
dataset.append(InputExample(texts=texts, guid=guid))
dataset_list.append(dataset)
train_dataloader = MultiDatasetDataLoader(
dataset_list, batch_size_pairs=batch_size_pairs, batch_size_triplets=batch_size_triplets
)
# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "dev":
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
# Configure the training
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=evaluation_steps,
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=use_amp,
checkpoint_path=model_save_path,
checkpoint_save_steps=1000,
checkpoint_save_total_limit=3,
)
# Quora Duplicate Questions
This folder contains scripts that demonstrate how to train SentenceTransformers for **Information Retrieval**. As a simple example, we will use the [Quora Duplicate Questions dataset](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs). It contains over 500,000 sentences with over 400,000 pairwise annotations of whether two questions are duplicates or not.
## Pretrained Models
Currently the following models trained on Quora Duplicate Questions are available:
* **distilbert-base-nli-stsb-quora-ranking**: We extended the *distilbert-base-nli-stsb-mean-tokens* model and trained it with *OnlineContrastiveLoss* and with *MultipleNegativesRankingLoss* on the Quora Duplicate questions dataset. For the code, see [training_multi-task-learning.py](training_multi-task-learning.py)
* **distilbert-multilingual-nli-stsb-quora-ranking**: Extension of *distilbert-base-nli-stsb-quora-ranking* to be multi-lingual. Trained on parallel data for 50 languages.
You can load & use pre-trained models like this:
```python
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("model_name")
```
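For example, a loaded model can be used to embed and compare two questions. A minimal sketch (the model name is one of the pretrained models listed above; the printed value is the cosine similarity):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("distilbert-base-nli-stsb-quora-ranking")

embeddings = model.encode(
    ["How can I learn Python?", "What is the best way to learn Python?"],
    convert_to_tensor=True,
)
print(util.cos_sim(embeddings[0], embeddings[1]))
```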
## Dataset
As the dataset to train a **Duplicate Questions Semantic Search Engine**, we use the [Quora Duplicate Questions dataset](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs). The original format looks like this:
```
id qid1 qid2 question1 question2 is_duplicate
0 1 2 What is the step by step guide to invest in share market in india? What is the step by step guide to invest in share market? 0
1 3 4 What is the story of Kohinoor (Koh-i-Noor) Diamond? What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back? 0
```
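A minimal sketch of reading this raw file with the standard `csv` module (assuming it has been downloaded into the working directory):

```python
import csv

with open("quora_duplicate_questions.tsv", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        print(row["question1"], "|", row["question2"], "| duplicate:", row["is_duplicate"])
        break  # Only show the first pair
```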
As a first step, we process this file to create distinct train/dev/test splits for different tasks. We define the following tasks:
- **Duplicate Questions Classification**: Given two questions, are these questions duplicates? This is the original task as defined by Quora; however, it is a rather impractical task: how do we retrieve possible duplicates for a given question from a large corpus? Further, models performing well on this classification task do not necessarily perform well on the following two tasks.
- **Duplicate Questions Mining**: Given a large set (like 100k) of questions, identify all question pairs that are duplicates.
- **Duplicate Questions Information Retrieval**: Given a large corpus (350k+) of questions and a new, unseen question, find the most related (i.e., duplicate) questions in this corpus.
**Download**: You can download the finished dataset here: [quora-IR-dataset.zip](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip)
For details on the creation of the dataset, see [create_splits.py](create_splits.py).
## Usage
### Duplicate Questions Mining
Given a large set of sentences (in this case questions), identify all pairs that are duplicates. See [Paraphrase Mining](../../applications/paraphrase-mining/README.md) for an example of how to use Sentence Transformers to mine for duplicate questions / paraphrases. This approach can be scaled to hundreds of thousands of sentences, provided you have enough memory.
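A minimal sketch using `util.paraphrase_mining` (the model name is just one of the pretrained models listed above):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("distilbert-base-nli-stsb-quora-ranking")

questions = [
    "How do I learn Python?",
    "What is the best way to learn Python?",
    "How do I reset my password?",
]

# Returns a list of (score, i, j) tuples, sorted by decreasing cosine similarity
pairs = util.paraphrase_mining(model, questions)
for score, i, j in pairs:
    print(f"{score:.3f}\t{questions[i]}\t{questions[j]}")
```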
### Semantic Search
The model can also be used for Information Retrieval / Semantic Search: given a new question, search a large corpus of hundreds of thousands of questions for duplicates. Provided you have enough memory, this approach works well for corpora up into the millions (depending on your real-time requirements).
For an interactive example, see [Semantic Search](../../applications/semantic-search/README.md).
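A minimal sketch with `util.semantic_search` (the model name and corpus are illustrative):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("distilbert-base-nli-stsb-quora-ranking")

corpus = ["How do I learn Python?", "How do I reset my password?", "What is the capital of France?"]
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

query_embedding = model.encode("What is the best way to learn Python?", convert_to_tensor=True)
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=3)[0]
for hit in hits:
    print(f"{hit['score']:.3f}\t{corpus[hit['corpus_id']]}")
```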
## Training
Choosing the right loss function is crucial for getting well-working sentence embeddings. For the given task, two loss functions are especially suitable: **ContrastiveLoss** and **MultipleNegativesRankingLoss**.
### Contrastive Loss
For the complete example, see [training_OnlineContrastiveLoss.py](training_OnlineContrastiveLoss.py).
In the original dataset, we have questions given with a label of 0=not duplicate and 1=duplicate. In that case, we can use contrastive loss: Similar pairs with label 1 are pulled together, so that they are close in vector space. Dissimilar pairs, that are closer than a defined margin, are pushed away in vector space.
Choosing the distance function and especially choosing a sensible margin are quite important for the success of contrastive loss. In the given example, we use cosine_distance (which is 1-cosine_similarity) with a margin of 0.5. I.e., non-duplicate questions should have a cosine_distance of at least 0.5 (which is equivalent to a 0.5 cosine similarity difference).
An improved version of contrastive loss is OnlineContrastiveLoss, which selects the negative pairs that have a lower distance than the largest positive pair, and the positive pairs that have a higher distance than the lowest negative pair. I.e., this loss automatically detects the hard cases in a batch and computes the loss only for these cases.
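The `distance_metric` and `margin` referenced in the snippet below can be defined like this (matching the cosine distance and 0.5 margin described above):

```python
from sentence_transformers import losses

# Cosine distance (1 - cosine similarity) with a margin of 0.5
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
margin = 0.5
```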
The loss can be used like this:
```python
train_samples = []
with open(
os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding="utf8"
) as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
sample = InputExample(
texts=[row["question1"], row["question2"]],
label=int(row["is_duplicate"]),
)
train_samples.append(sample)
train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.OnlineContrastiveLoss(model=model, distance_metric=distance_metric, margin=margin)
```
For each row in our train dataset, we create an InputExample with the two questions as texts and is_duplicate as the label.
### MultipleNegativesRankingLoss
For the complete example, see [training_MultipleNegativesRankingLoss.py](training_MultipleNegativesRankingLoss.py).
*MultipleNegativesRankingLoss* is especially suitable for Information Retrieval / Semantic Search. A nice advantage of *MultipleNegativesRankingLoss* is that it only requires positive pairs, i.e., we only need examples of duplicate questions.
From all pairs, we sample a mini-batch *(a_1, b_1), ..., (a_n, b_n)* where *(a_i, b_i)* is a duplicate question pair.
MultipleNegativesRankingLoss now uses all *b_j* with j != i as negative examples for *a_i*. For example, for *a_1* we are given the options *(b_1, ..., b_n)* and need to identify which is the correct duplicate question for *a_1*. We do this by computing the dot-product between the embedding of *a_1* and all *b*'s and softmax-normalizing the scores, so that we get a probability distribution over *(b_1, ..., b_n)*. In the best case, the positive example *b_1* gets a probability close to 1 while all others get scores close to 0. We use the negative log-likelihood to compute the loss.
*MultipleNegativesRankingLoss* implements this idea in an efficient way so that the embeddings are re-used. With a batch size of 64, we have 64 positive pairs and each positive pair has 63 negative distractors.
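The core idea can be sketched in a few lines of plain PyTorch. This is an illustration of the principle, not the library's implementation (which, among other things, uses scaled cosine similarity instead of a raw dot-product):

```python
import torch

# a_emb, b_emb: embeddings of the i-th question and its duplicate, shape (batch_size, dim)
batch_size, dim = 4, 8
a_emb = torch.randn(batch_size, dim)
b_emb = torch.randn(batch_size, dim)

# Score every a_i against every b_j; the diagonal holds the positive pairs
scores = a_emb @ b_emb.T  # shape (batch_size, batch_size)

# Cross-entropy with labels 0..n-1 is the negative log-likelihood of picking the correct b_i
labels = torch.arange(batch_size)
loss = torch.nn.functional.cross_entropy(scores, labels)
print(loss)
```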
Using the loss is easy and does not require tuning of any hyperparameters:
```python
train_samples = []
with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["is_duplicate"] == "1":
train_samples.append(
InputExample(texts=[row["question1"], row["question2"]], label=1)
)
train_samples.append(
InputExample(texts=[row["question2"], row["question1"]], label=1)
) # if A is a duplicate of B, then B is a duplicate of A
# After reading the train_samples, we create a SentencesDataset and a DataLoader
train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model)
```
We only use the positive examples. As 'is_duplicate' is a symmetric relation, we not only add (A, B) but also (B, A) to our training sample set.
**Note 1:** Increasing the batch sizes usually yields better results, as the task gets harder. It is more difficult to identify the correct duplicate question out of a set of 100 questions than out of a set of only 10 questions. So it is advisable to set the training batch size as large as possible. I trained it with a batch size of 350 on 32 GB GPU memory.
**Note 2:** MultipleNegativesRankingLoss only works if *(a_i, b_j)* with j != i is actually a negative, non-duplicate question pair. In a few instances, this assumption is wrong. But in the majority of cases, if we sample two random questions, they are not duplicates. If your dataset cannot fulfil this property, MultipleNegativesRankingLoss might not work well.
### Multi-Task-Learning
Contrastive loss works well for pair classification, i.e., given a pair of questions, are they duplicates or not. It pushes negative pairs far apart in vector space, so that distinguishing between duplicate and non-duplicate pairs works well.
MultipleNegativesRankingLoss, on the other hand, mainly reduces the distance between positive pairs out of a large set of possible candidates. However, the distance between non-duplicate questions is not necessarily large, so this loss does not work that well for pair classification.
In [training_multi-task-learning.py](training_multi-task-learning.py) I demonstrate how we can train the network with both losses. The essential code is to define both losses and pass them to the fit method.
```python
train_samples_MultipleNegativesRankingLoss = []
train_samples_ContrastiveLoss = []
with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
train_samples_ContrastiveLoss.append(
InputExample(
texts=[row["question1"], row["question2"]],
label=int(row["is_duplicate"]),
)
)
if row["is_duplicate"] == "1":
train_samples_MultipleNegativesRankingLoss.append(
InputExample(texts=[row["question1"], row["question2"]], label=1)
)
train_samples_MultipleNegativesRankingLoss.append(
InputExample(texts=[row["question2"], row["question1"]], label=1)
) # if A is a duplicate of B, then B is a duplicate of A
# Create data loader and loss for MultipleNegativesRankingLoss
train_dataset_MultipleNegativesRankingLoss = SentencesDataset(
train_samples_MultipleNegativesRankingLoss, model=model
)
train_dataloader_MultipleNegativesRankingLoss = DataLoader(
train_dataset_MultipleNegativesRankingLoss,
shuffle=True,
batch_size=train_batch_size,
)
train_loss_MultipleNegativesRankingLoss = losses.MultipleNegativesRankingLoss(model)
# Create data loader and loss for OnlineContrastiveLoss
train_dataset_ConstrativeLoss = SentencesDataset(
train_samples_ConstrativeLoss, model=model
)
train_dataloader_ConstrativeLoss = DataLoader(
train_dataset_ConstrativeLoss, shuffle=True, batch_size=train_batch_size
)
train_loss_ConstrativeLoss = losses.OnlineContrastiveLoss(
model=model, distance_metric=distance_metric, margin=margin
)
# .....
# Train the model
model.fit(
train_objectives=[
(train_dataloader_MultipleNegativesRankingLoss, train_loss_MultipleNegativesRankingLoss),
(train_dataloader_ConstrativeLoss, train_loss_ConstrativeLoss),
],
evaluator=seq_evaluator,
epochs=num_epochs,
warmup_steps=1000,
output_path=model_save_path,
)
```
"""
This application demonstrates how to find duplicate questions (paraphrases) in a long
list of sentences.
"""
from sentence_transformers import SentenceTransformer, util
# Questions can be a long list of sentences up to 100k sentences or more.
# For demonstration purposes, we limit it to a few questions which each have one duplicate
questions = [
"How did you catch your spouse cheating?",
"How can I find out if my husband is cheating?",
"Is my wife cheating?",
"How do I know if my partner is cheating?",
"Why is Starbucks in India overrated?",
"Is Starbucks overrated in india?",
"How can I lose weight fast without exercise?",
"Can I lose weight without exercise?",
"Which city is the best in India? Why?",
"Which is the best city in India?",
"How can I stay focused in class?",
"How can I stay focused on my school work?",
"How can I Remotely hack a mobile phone?",
"How can I hack my phone?",
"Where should I stay in Goa?",
"Which are the best hotels in Goa?",
"Why does hair turn white?",
"What causes older peoples hair to turn grey?",
"What is the easiest way to get followers on Quora?",
"How do I get more followers for my Quora?",
]
model = SentenceTransformer("all-MiniLM-L6-v2")
# Given a model and a list of strings (texts), util.paraphrase_mining performs a paraphrase
# mining task by computing the cosine similarity between all possible combinations and returning the pairs with the highest scores.
# It returns a list of tuples (score, i, j) with i, j representing the indices in the questions list.
pairs = util.paraphrase_mining(model, questions)
# Output Top-20 pairs:
for score, qid1, qid2 in pairs[0:20]:
print("{:.3f}\t{}\t\t\t{}".format(score, questions[qid1], questions[qid2]))
"""
The Quora Duplicate Questions dataset contains question pairs from Quora (www.quora.com)
along with a label indicating whether the two questions are duplicates, i.e., have an identical intent.
Example of a duplicate pair:
How do I enhance my English? AND How can I become good at English?
Example of a non-duplicate pair:
How are roads named? AND How are airport runways named?
More details and the original Quora dataset can be found here:
https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs
Dataset: http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv
You do not need to run this script. You can download all files from here:
https://sbert.net/datasets/quora-duplicate-questions.zip
This script does the following:
1) After reading the quora_duplicate_questions.tsv, as provided by Quora, we add a transitive closure: If questions (A, B) are duplicates and (B, C) are duplicates, then (A, C) must also be a duplicate. We add these missing links.
2) Next, we split sentences into train, dev, and test with a ratio of about 85% / 5% / 10%. In contrast to most other Quora data splits, like the split provided by GLUE, we ensure that the three sets are overlap free, i.e., no sentences in dev / test appear in the train dataset. In order to achieve three distinct sets, we pick a sentence and then assign it together with all of its duplicates to the respective set.
3) After distributing sentences to the three dataset splits, we create files to facilitate 3 different tasks:
3.1) Classification - Given two sentences, are they duplicates? This is identical to the original Quora task and the task in GLUE, but with the big difference that sentences in dev / test have not been seen in train.
3.2) Duplicate Question Mining - Given a large set of questions, identify all duplicates. The dev set consists of about 50k questions, the test set of about 100k sentences.
3.3) Information Retrieval - Given a question as query, find in a large corpus (~350k questions) the duplicates of the query question.
The output consists of the following files:
quora_duplicate_questions.tsv - Original file provided by Quora (https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs)
classification/
train/dev/test_pairs.tsv - Distinct sets of question pairs with label for duplicate / non-duplicate. These splits can be used for sentence pair classification tasks
duplicate-mining/ - Given a large set of questions, find all duplicates.
_corpus.tsv - Large set of sentences
_duplicates.tsv - All duplicate questions in the respective corpus.tsv
information-retrieval/ - Given a large corpus of questions, find the duplicates for a given query
corpus.tsv - This file will be used for train/dev/test. It contains all questions in the corpus
dev/test-queries.tsv - Queries and the respective duplicate questions (QIDs) in the corpus
"""
import csv
from collections import defaultdict
import random
import os
from sentence_transformers import util
random.seed(42)
# Get raw file
source_file = "quora-IR-dataset/quora_duplicate_questions.tsv"
os.makedirs("quora-IR-dataset", exist_ok=True)
os.makedirs("quora-IR-dataset/graph", exist_ok=True)
os.makedirs("quora-IR-dataset/information-retrieval", exist_ok=True)
os.makedirs("quora-IR-dataset/classification", exist_ok=True)
os.makedirs("quora-IR-dataset/duplicate-mining", exist_ok=True)
if not os.path.exists(source_file):
print("Download file to", source_file)
util.http_get("http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv", source_file)
# Read pairwise file
sentences = {}
duplicates = defaultdict(lambda: defaultdict(bool))
rows = []
with open(source_file, encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
for row in reader:
id1 = row["qid1"]
id2 = row["qid2"]
question1 = row["question1"].replace("\r", "").replace("\n", " ").replace("\t", " ")
question2 = row["question2"].replace("\r", "").replace("\n", " ").replace("\t", " ")
is_duplicate = row["is_duplicate"]
if question1 == "" or question2 == "":
continue
sentences[id1] = question1
sentences[id2] = question2
rows.append(
{"qid1": id1, "qid2": id2, "question1": question1, "question2": question2, "is_duplicate": is_duplicate}
)
if is_duplicate == "1":
duplicates[id1][id2] = True
duplicates[id2][id1] = True
# Search for (near) exact duplicates
# The original Quora duplicate questions dataset is an incomplete annotation,
# i.e., there are several duplicate question pairs which are not marked as duplicates.
# These missing annotations can make it difficult to compare approaches.
# Here we use a simple approach that searches for nearly identical questions that differ only in, e.g., a stopword.
# We also mark these question pairs as duplicates to increase the annotation coverage
stopwords = set(
[
"a",
"about",
"above",
"after",
"again",
"against",
"ain",
"all",
"am",
"an",
"and",
"any",
"are",
"aren",
"aren't",
"as",
"at",
"be",
"because",
"been",
"before",
"being",
"below",
"between",
"both",
"but",
"by",
"can",
"couldn",
"couldn't",
"d",
"did",
"didn",
"didn't",
"do",
"does",
"doesn",
"doesn't",
"doing",
"don",
"don't",
"down",
"during",
"each",
"few",
"for",
"from",
"further",
"had",
"hadn",
"hadn't",
"has",
"hasn",
"hasn't",
"have",
"haven",
"haven't",
"having",
"he",
"her",
"here",
"hers",
"herself",
"him",
"himself",
"his",
"i",
"if",
"in",
"into",
"is",
"isn",
"isn't",
"it's",
"its",
"itself",
"just",
"ll",
"m",
"ma",
"me",
"mightn",
"mightn't",
"more",
"most",
"mustn",
"mustn't",
"my",
"myself",
"needn",
"needn't",
"no",
"nor",
"not",
"now",
"o",
"of",
"off",
"on",
"once",
"only",
"or",
"other",
"our",
"ours",
"ourselves",
"out",
"over",
"own",
"re",
"s",
"same",
"shan",
"shan't",
"she",
"she's",
"should",
"should've",
"shouldn",
"shouldn't",
"so",
"some",
"such",
"t",
"than",
"that",
"that'll",
"the",
"their",
"theirs",
"them",
"themselves",
"then",
"there",
"these",
"they",
"this",
"those",
"through",
"to",
"too",
"under",
"until",
"up",
"ve",
"very",
"was",
"wasn",
"wasn't",
"we",
"were",
"weren",
"weren't",
"which",
"while",
"will",
"with",
"won",
"won't",
"wouldn",
"wouldn't",
"y",
"you",
"you'd",
"you'll",
"you're",
"you've",
"your",
"yours",
"yourself",
"yourselves",
]
)
num_new_duplicates = 0
sentences_norm = {}
for id, sent in sentences.items():
sent_norm = sent.lower()
# Replace some common paraphrases
sent_norm = sent_norm.replace("how do you", "how do i").replace("how do we", "how do i")
sent_norm = (
sent_norm.replace("how can we", "how can i")
.replace("how can you", "how can i")
.replace("how can i", "how do i")
)
sent_norm = sent_norm.replace("really true", "true")
sent_norm = sent_norm.replace("what are the importance", "what is the importance")
sent_norm = sent_norm.replace("what was", "what is")
sent_norm = sent_norm.replace("so many", "many")
sent_norm = sent_norm.replace("would it take", "will it take")
# Remove any punctuation characters
for c in [",", "!", ".", "?", "'", '"', ":", ";", "[", "]", "{", "}", "<", ">"]:
sent_norm = sent_norm.replace(c, " ")
# Remove stop words
tokens = sent_norm.split()
tokens = [token for token in tokens if token not in stopwords]
sent_norm = "".join(tokens)
if sent_norm in sentences_norm:
if not duplicates[id][sentences_norm[sent_norm]]:
num_new_duplicates += 1
duplicates[id][sentences_norm[sent_norm]] = True
duplicates[sentences_norm[sent_norm]][id] = True
else:
sentences_norm[sent_norm] = id
print("(Nearly) exact duplicates found:", num_new_duplicates)
# Add transitive closure (if a,b and b,c duplicates => a,c are duplicates)
new_entries = True
while new_entries:
print("Add transitive closure")
new_entries = False
for a in sentences:
for b in list(duplicates[a]):
for c in list(duplicates[b]):
if a != c and not duplicates[a][c]:
new_entries = True
duplicates[a][c] = True
duplicates[c][a] = True
# Distribute rows to train/dev/test split
# Ensure that sets contain distinct sentences
is_assigned = set()
random.shuffle(rows)
train_ids = set()
dev_ids = set()
test_ids = set()
counter = 0
for row in rows:
if row["qid1"] in is_assigned and row["qid2"] in is_assigned:
continue
elif row["qid1"] in is_assigned or row["qid2"] in is_assigned:
if row["qid2"] in is_assigned: # Ensure that qid1 is assigned and qid2 not yet
row["qid1"], row["qid2"] = row["qid2"], row["qid1"]
# Move qid2 to the same split as qid1
target_set = train_ids
if row["qid1"] in dev_ids:
target_set = dev_ids
elif row["qid1"] in test_ids:
target_set = test_ids
else:
# Distribution about 85%/5%/10%
target_set = train_ids
if counter % 10 == 0:
target_set = dev_ids
elif counter % 10 == 1 or counter % 10 == 2:
target_set = test_ids
counter += 1
# Get the sentence with all duplicates and add it to the respective sets
target_set.add(row["qid1"])
is_assigned.add(row["qid1"])
target_set.add(row["qid2"])
is_assigned.add(row["qid2"])
for b in list(duplicates[row["qid1"]]) + list(duplicates[row["qid2"]]):
target_set.add(b)
is_assigned.add(b)
# Assert all sets are mutually exclusive
assert len(train_ids.intersection(dev_ids)) == 0
assert len(train_ids.intersection(test_ids)) == 0
assert len(test_ids.intersection(dev_ids)) == 0
print("\nTrain sentences:", len(train_ids))
print("Dev sentences:", len(dev_ids))
print("Test sentences:", len(test_ids))
# Extract the ids for duplicate questions for train/dev/test
def get_duplicate_set(ids_set):
dups_set = set()
for a in ids_set:
for b in duplicates[a]:
ids = sorted([a, b])
dups_set.add(tuple(ids))
return dups_set
train_duplicates = get_duplicate_set(train_ids)
dev_duplicates = get_duplicate_set(dev_ids)
test_duplicates = get_duplicate_set(test_ids)
print("\nTrain duplicates", len(train_duplicates))
print("Dev duplicates", len(dev_duplicates))
print("Test duplicates", len(test_duplicates))
############### Write general files about the duplicate questions graph ############
with open("quora-IR-dataset/graph/sentences.tsv", "w", encoding="utf8") as fOut:
fOut.write("qid\tquestion\n")
for id, question in sentences.items():
fOut.write("{}\t{}\n".format(id, question))
duplicates_list = set()
for a in duplicates:
for b in duplicates[a]:
duplicates_list.add(tuple(sorted([int(a), int(b)])))
duplicates_list = list(duplicates_list)
duplicates_list = sorted(duplicates_list, key=lambda x: x[0] * 1000000 + x[1])
print("\nWrite duplicate graph in pairwise format")
with open("quora-IR-dataset/graph/duplicates-graph-pairwise.tsv", "w", encoding="utf8") as fOut:
fOut.write("qid1\tqid2\n")
for a, b in duplicates_list:
fOut.write("{}\t{}\n".format(a, b))
print("Write duplicate graph in list format")
with open("quora-IR-dataset/graph/duplicates-graph-list.tsv", "w", encoding="utf8") as fOut:
fOut.write("qid1\tqid2\n")
for a in sorted(duplicates.keys(), key=lambda x: int(x)):
if len(duplicates[a]) > 0:
fOut.write("{}\t{}\n".format(a, ",".join(sorted(duplicates[a]))))
print("Write duplicate graph in connected subgraph format")
with open("quora-IR-dataset/graph/duplicates-graph-connected-nodes.tsv", "w", encoding="utf8") as fOut:
written_qids = set()
fOut.write("qids\n")
for a in sorted(duplicates.keys(), key=lambda x: int(x)):
if a not in written_qids:
ids = set()
ids.add(a)
for b in duplicates[a]:
ids.add(b)
fOut.write("{}\n".format(",".join(sorted(ids, key=lambda x: int(x)))))
for id in ids:
written_qids.add(id)
def write_qids(name, ids_list):
with open("quora-IR-dataset/graph/" + name + "-questions.tsv", "w", encoding="utf8") as fOut:
fOut.write("qid\n")
fOut.write("\n".join(sorted(ids_list, key=lambda x: int(x))))
write_qids("train", train_ids)
write_qids("dev", dev_ids)
write_qids("test", test_ids)
####### Output for duplicate mining #######
def write_mining_files(name, ids, dups):
with open("quora-IR-dataset/duplicate-mining/" + name + "_corpus.tsv", "w", encoding="utf8") as fOut:
fOut.write("qid\tquestion\n")
for id in ids:
fOut.write("{}\t{}\n".format(id, sentences[id]))
with open("quora-IR-dataset/duplicate-mining/" + name + "_duplicates.tsv", "w", encoding="utf8") as fOut:
fOut.write("qid1\tqid2\n")
for a, b in dups:
fOut.write("{}\t{}\n".format(a, b))
write_mining_files("train", train_ids, train_duplicates)
write_mining_files("dev", dev_ids, dev_duplicates)
write_mining_files("test", test_ids, test_duplicates)
###### Classification dataset #####
with open("quora-IR-dataset/classification/train_pairs.tsv", "w", encoding="utf8") as fOutTrain, open(
"quora-IR-dataset/classification/dev_pairs.tsv", "w", encoding="utf8"
) as fOutDev, open("quora-IR-dataset/classification/test_pairs.tsv", "w", encoding="utf8") as fOutTest:
fOutTrain.write("\t".join(["qid1", "qid2", "question1", "question2", "is_duplicate"]) + "\n")
fOutDev.write("\t".join(["qid1", "qid2", "question1", "question2", "is_duplicate"]) + "\n")
fOutTest.write("\t".join(["qid1", "qid2", "question1", "question2", "is_duplicate"]) + "\n")
for row in rows:
id1 = row["qid1"]
id2 = row["qid2"]
target = None
if id1 in train_ids and id2 in train_ids:
target = fOutTrain
elif id1 in dev_ids and id2 in dev_ids:
target = fOutDev
elif id1 in test_ids and id2 in test_ids:
target = fOutTest
if target is not None:
target.write("\t".join([row["qid1"], row["qid2"], sentences[id1], sentences[id2], row["is_duplicate"]]))
target.write("\n")
####### Write files for Information Retrieval #####
num_dev_queries = 5000
num_test_queries = 10000
corpus_ids = train_ids.copy()
dev_queries = set()
test_queries = set()
# Create dev queries
rnd_dev_ids = sorted(list(dev_ids))
random.shuffle(rnd_dev_ids)
for a in rnd_dev_ids:
if a not in corpus_ids:
if len(dev_queries) < num_dev_queries and len(duplicates[a]) > 0:
dev_queries.add(a)
else:
corpus_ids.add(a)
for b in duplicates[a]:
if b not in dev_queries:
corpus_ids.add(b)
# Create test queries
rnd_test_ids = sorted(list(test_ids))
random.shuffle(rnd_test_ids)
for a in rnd_test_ids:
if a not in corpus_ids:
if len(test_queries) < num_test_queries and len(duplicates[a]) > 0:
test_queries.add(a)
else:
corpus_ids.add(a)
for b in duplicates[a]:
if b not in test_queries:
corpus_ids.add(b)
# Write output for information-retrieval
print("\nInformation Retrieval Setup")
print("Corpus size:", len(corpus_ids))
print("Dev queries:", len(dev_queries))
print("Test queries:", len(test_queries))
with open("quora-IR-dataset/information-retrieval/corpus.tsv", "w", encoding="utf8") as fOut:
fOut.write("qid\tquestion\n")
for id in sorted(corpus_ids, key=lambda id: int(id)):
fOut.write("{}\t{}\n".format(id, sentences[id]))
with open("quora-IR-dataset/information-retrieval/dev-queries.tsv", "w", encoding="utf8") as fOut:
fOut.write("qid\tquestion\tduplicate_qids\n")
for id in sorted(dev_queries, key=lambda id: int(id)):
fOut.write("{}\t{}\t{}\n".format(id, sentences[id], ",".join(duplicates[id])))
with open("quora-IR-dataset/information-retrieval/test-queries.tsv", "w", encoding="utf8") as fOut:
fOut.write("qid\tquestion\tduplicate_qids\n")
for id in sorted(test_queries, key=lambda id: int(id)):
fOut.write("{}\t{}\t{}\n".format(id, sentences[id], ",".join(duplicates[id])))
print("--DONE--")
"""
This script demonstrates how to train a sentence embedding model for Information Retrieval.
As dataset, we use Quora Duplicates Questions, where we have pairs of duplicate questions.
As loss function, we use MultipleNegativesRankingLoss. Here, we only need positive pairs, i.e., pairs of sentences/texts that are considered to be relevant. Our dataset looks like this: (a_1, b_1), (a_2, b_2), ..., where a_i and b_i are texts and (a_i, b_i) is a relevant pair (e.g., duplicates).
MultipleNegativesRankingLoss takes a random subset of these, for example (a_1, b_1), ..., (a_n, b_n). a_i and b_i are considered to be relevant and should be close in vector space. All other b_j (for i != j) are negative examples and the distance between a_i and b_j should be maximized. Note: MultipleNegativesRankingLoss only works if a random b_j is likely not to be relevant for a_i. This is the case for our duplicate questions dataset: If we sample a random b_j, it is unlikely to be a duplicate of a_i.
The model we get works well for duplicate questions mining and for duplicate questions information retrieval. For question pair classification, other losses (like OnlineContrastiveLoss) work better.
"""
from torch.utils.data import DataLoader
from sentence_transformers import losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import csv
import os
from zipfile import ZipFile
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)
#### /print debug information to stdout
# As base model, we use DistilBERT-base that was pre-trained on NLI and STSb data
model = SentenceTransformer("stsb-distilbert-base")
# Training for multiple epochs can be beneficial, as in each epoch a mini-batch is sampled differently
# hence, we get different negatives for each positive
num_epochs = 10
# Increasing the batch size improves the performance for MultipleNegativesRankingLoss. Choose it as large as possible
# I achieved good results with a batch size of 300-350 (requires about 30 GB of GPU memory)
train_batch_size = 64
dataset_path = "quora-IR-dataset"
model_save_path = "output/training_MultipleNegativesRankingLoss-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
os.makedirs(model_save_path, exist_ok=True)
# Check if the dataset exists. If not, download and extract
if not os.path.exists(dataset_path):
logger.info("Dataset not found. Download")
zip_save_path = "quora-IR-dataset.zip"
util.http_get(url="https://sbert.net/datasets/quora-IR-dataset.zip", path=zip_save_path)
with ZipFile(zip_save_path, "r") as zip:
zip.extractall(dataset_path)
######### Read train data ##########
train_samples = []
with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["is_duplicate"] == "1":
train_samples.append(InputExample(texts=[row["question1"], row["question2"]], label=1))
train_samples.append(
InputExample(texts=[row["question2"], row["question1"]], label=1)
) # if A is a duplicate of B, then B is a duplicate of A
# After reading the train_samples, we create a DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model)
################### Development Evaluators ##################
# We add 3 evaluators, that evaluate the model on Duplicate Questions pair classification,
# Duplicate Questions Mining, and Duplicate Questions Information Retrieval
evaluators = []
###### Classification ######
# Given (question1, question2), is this a duplicate or not?
# The evaluator will compute the embeddings for both questions and then compute
# a cosine similarity. If the similarity is above a threshold, we have a duplicate.
dev_sentences1 = []
dev_sentences2 = []
dev_labels = []
with open(os.path.join(dataset_path, "classification/dev_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
dev_sentences1.append(row["question1"])
dev_sentences2.append(row["question2"])
dev_labels.append(int(row["is_duplicate"]))
binary_acc_evaluator = evaluation.BinaryClassificationEvaluator(dev_sentences1, dev_sentences2, dev_labels)
evaluators.append(binary_acc_evaluator)
###### Duplicate Questions Mining ######
# Given a large corpus of questions, identify all duplicates in that corpus.
# For faster processing, we limit the development corpus to only 10,000 sentences.
max_dev_samples = 10000
dev_sentences = {}
dev_duplicates = []
with open(os.path.join(dataset_path, "duplicate-mining/dev_corpus.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
dev_sentences[row["qid"]] = row["question"]
if len(dev_sentences) >= max_dev_samples:
break
with open(os.path.join(dataset_path, "duplicate-mining/dev_duplicates.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["qid1"] in dev_sentences and row["qid2"] in dev_sentences:
dev_duplicates.append([row["qid1"], row["qid2"]])
# The ParaphraseMiningEvaluator computes the cosine similarity between all sentences and
# extracts a list with the pairs that have the highest similarity. Given the duplicate
# information in dev_duplicates, it then computes an F1 score measuring how well our duplicate mining worked
paraphrase_mining_evaluator = evaluation.ParaphraseMiningEvaluator(dev_sentences, dev_duplicates, name="dev")
evaluators.append(paraphrase_mining_evaluator)
###### Duplicate Questions Information Retrieval ######
# Given a question and a large corpus of thousands of questions, find the most relevant (i.e. duplicate) question
# in that corpus.
# For faster processing, we limit the development corpus to only 10,000 sentences.
max_corpus_size = 10000
ir_queries = {} # Our queries (qid => question)
ir_needed_qids = set() # QIDs we need in the corpus
ir_corpus = {} # Our corpus (qid => question)
ir_relevant_docs = {} # Mapping of relevant documents for a given query (qid => set([relevant_question_ids]))
with open(os.path.join(dataset_path, "information-retrieval/dev-queries.tsv"), encoding="utf8") as fIn:
next(fIn) # Skip header
for line in fIn:
qid, query, duplicate_ids = line.strip().split("\t")
duplicate_ids = duplicate_ids.split(",")
ir_queries[qid] = query
ir_relevant_docs[qid] = set(duplicate_ids)
for qid in duplicate_ids:
ir_needed_qids.add(qid)
# First get all needed relevant documents (i.e., we must ensure that the relevant questions are actually in the corpus)
distraction_questions = {}
with open(os.path.join(dataset_path, "information-retrieval/corpus.tsv"), encoding="utf8") as fIn:
next(fIn) # Skip header
for line in fIn:
qid, question = line.strip().split("\t")
if qid in ir_needed_qids:
ir_corpus[qid] = question
else:
distraction_questions[qid] = question
# Now, also add some irrelevant questions to fill our corpus
other_qid_list = list(distraction_questions.keys())
random.shuffle(other_qid_list)
for qid in other_qid_list[0 : max(0, max_corpus_size - len(ir_corpus))]:
ir_corpus[qid] = distraction_questions[qid]
# Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR
# metrics. For our use case MRR@k and Accuracy@k are relevant.
ir_evaluator = evaluation.InformationRetrievalEvaluator(ir_queries, ir_corpus, ir_relevant_docs)
evaluators.append(ir_evaluator)
# Create a SequentialEvaluator. This SequentialEvaluator runs all three evaluators in a sequential order.
# We optimize the model with respect to the score from the last evaluator (scores[-1])
seq_evaluator = evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1])
logger.info("Evaluate model without training")
seq_evaluator(model, epoch=0, steps=0, output_path=model_save_path)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=seq_evaluator,
epochs=num_epochs,
warmup_steps=1000,
output_path=model_save_path,
)
"""
This script demonstrates how to train a sentence embedding model for question pair classification
with cosine-similarity and a simple threshold.
As dataset, we use Quora Duplicates Questions, where we have labeled pairs of questions being either duplicates (label 1) or non-duplicate (label 0).
As loss function, we use OnlineContrastiveLoss. It reduces the distance between positive pairs, i.e., it pulls the embeddings of positive pairs closer together. For negative pairs, it pushes them further apart.
An issue with contrastive loss is that it might push away sentences that are already well positioned in vector space.
"""
from torch.utils.data import DataLoader
from sentence_transformers import losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import csv
import os
from zipfile import ZipFile
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)
#### /print debug information to stdout
# As base model, we use DistilBERT-base that was pre-trained on NLI and STSb data
model = SentenceTransformer("stsb-distilbert-base")
num_epochs = 10
train_batch_size = 64
# As distance metric, we use cosine distance (cosine_distance = 1-cosine_similarity)
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
# Negative pairs should have a distance of at least 0.5
margin = 0.5
dataset_path = "quora-IR-dataset"
model_save_path = "output/training_OnlineConstrativeLoss-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
os.makedirs(model_save_path, exist_ok=True)
# Check if the dataset exists. If not, download and extract
if not os.path.exists(dataset_path):
logger.info("Dataset not found. Download")
zip_save_path = "quora-IR-dataset.zip"
util.http_get(url="https://sbert.net/datasets/quora-IR-dataset.zip", path=zip_save_path)
with ZipFile(zip_save_path, "r") as zip:
zip.extractall(dataset_path)
######### Read train data ##########
# Read train data
train_samples = []
with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
sample = InputExample(texts=[row["question1"], row["question2"]], label=int(row["is_duplicate"]))
train_samples.append(sample)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.OnlineContrastiveLoss(model=model, distance_metric=distance_metric, margin=margin)
################### Development Evaluators ##################
# We add 3 evaluators, that evaluate the model on Duplicate Questions pair classification,
# Duplicate Questions Mining, and Duplicate Questions Information Retrieval
evaluators = []
###### Classification ######
# Given (question1, question2), is this a duplicate or not?
# The evaluator will compute the embeddings for both questions and then compute
# a cosine similarity. If the similarity is above a threshold, we have a duplicate.
dev_sentences1 = []
dev_sentences2 = []
dev_labels = []
with open(os.path.join(dataset_path, "classification/dev_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
dev_sentences1.append(row["question1"])
dev_sentences2.append(row["question2"])
dev_labels.append(int(row["is_duplicate"]))
binary_acc_evaluator = evaluation.BinaryClassificationEvaluator(dev_sentences1, dev_sentences2, dev_labels)
evaluators.append(binary_acc_evaluator)
###### Duplicate Questions Mining ######
# Given a large corpus of questions, identify all duplicates in that corpus.
# For faster processing, we limit the development corpus to only 10,000 sentences.
max_dev_samples = 10000
dev_sentences = {}
dev_duplicates = []
with open(os.path.join(dataset_path, "duplicate-mining/dev_corpus.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
dev_sentences[row["qid"]] = row["question"]
if len(dev_sentences) >= max_dev_samples:
break
with open(os.path.join(dataset_path, "duplicate-mining/dev_duplicates.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["qid1"] in dev_sentences and row["qid2"] in dev_sentences:
dev_duplicates.append([row["qid1"], row["qid2"]])
# The ParaphraseMiningEvaluator computes the cosine similarity between all sentences and
# extracts a list with the pairs that have the highest similarity. Given the duplicate
# information in dev_duplicates, it then computes an F1 score measuring how well our duplicate mining worked
paraphrase_mining_evaluator = evaluation.ParaphraseMiningEvaluator(dev_sentences, dev_duplicates, name="dev")
evaluators.append(paraphrase_mining_evaluator)
###### Duplicate Questions Information Retrieval ######
# Given a question and a large corpus of thousands of questions, find the most relevant (i.e. duplicate) question
# in that corpus.
# For faster processing, we limit the development corpus to only 100,000 sentences.
max_corpus_size = 100000
ir_queries = {} # Our queries (qid => question)
ir_needed_qids = set() # QIDs we need in the corpus
ir_corpus = {} # Our corpus (qid => question)
ir_relevant_docs = {} # Mapping of relevant documents for a given query (qid => set([relevant_question_ids]))
with open(os.path.join(dataset_path, "information-retrieval/dev-queries.tsv"), encoding="utf8") as fIn:
next(fIn) # Skip header
for line in fIn:
qid, query, duplicate_ids = line.strip().split("\t")
duplicate_ids = duplicate_ids.split(",")
ir_queries[qid] = query
ir_relevant_docs[qid] = set(duplicate_ids)
for qid in duplicate_ids:
ir_needed_qids.add(qid)
# First get all needed relevant documents (i.e., we must ensure that the relevant questions are actually in the corpus)
distraction_questions = {}
with open(os.path.join(dataset_path, "information-retrieval/corpus.tsv"), encoding="utf8") as fIn:
next(fIn) # Skip header
for line in fIn:
qid, question = line.strip().split("\t")
if qid in ir_needed_qids:
ir_corpus[qid] = question
else:
distraction_questions[qid] = question
# Now, also add some irrelevant questions to fill our corpus
other_qid_list = list(distraction_questions.keys())
random.shuffle(other_qid_list)
for qid in other_qid_list[0 : max(0, max_corpus_size - len(ir_corpus))]:
ir_corpus[qid] = distraction_questions[qid]
# Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR
# metrics. For our use case MRR@k and Accuracy@k are relevant.
ir_evaluator = evaluation.InformationRetrievalEvaluator(ir_queries, ir_corpus, ir_relevant_docs)
evaluators.append(ir_evaluator)
# Create a SequentialEvaluator. This SequentialEvaluator runs all three evaluators in a sequential order.
# We optimize the model with respect to the score from the last evaluator (scores[-1])
seq_evaluator = evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1])
logger.info("Evaluate model without training")
seq_evaluator(model, epoch=0, steps=0, output_path=model_save_path)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=seq_evaluator,
epochs=num_epochs,
warmup_steps=1000,
output_path=model_save_path,
)
"""
This script combines training_OnlineContrastiveLoss.py with training_MultipleNegativesRankingLoss.py
Online contrastive loss works well for classification (are question1 and question2 duplicates?), but it
performs less well for duplicate questions mining. MultipleNegativesRankingLoss works well for duplicate
questions mining, but it has some issues with classification as it does not push dissimilar pairs away.
This script combines both losses to get the best of both worlds.
Multi task learning is achieved quite easily by calling the model.fit method like this:
model.fit(train_objectives=[(train_dataloader_MultipleNegativesRankingLoss, train_loss_MultipleNegativesRankingLoss), (train_dataloader_constrative_loss, train_loss_constrative_loss)] ...)
"""
from torch.utils.data import DataLoader
from sentence_transformers import losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import csv
import os
from zipfile import ZipFile
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)
#### /print debug information to stdout
# As base model, we use DistilBERT-base that was pre-trained on NLI and STSb data
model = SentenceTransformer("stsb-distilbert-base")
# Training for multiple epochs can be beneficial, as in each epoch a mini-batch is sampled differently
# hence, we get different negatives for each positive
num_epochs = 10
# Increasing the batch size improves the performance for MultipleNegativesRankingLoss. Choose it as large as possible
# I achieved good results with a batch size of 300-350 (requires about 30 GB of GPU memory)
train_batch_size = 64
# As distance metric, we use cosine distance (cosine_distance = 1-cosine_similarity)
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
# Negative pairs should have a distance of at least 0.5
margin = 0.5
dataset_path = "quora-IR-dataset"
model_save_path = "output/training_multi-task-learning" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
os.makedirs(model_save_path, exist_ok=True)
# Check if the dataset exists. If not, download and extract
if not os.path.exists(dataset_path):
logger.info("Dataset not found. Download")
zip_save_path = "quora-IR-dataset.zip"
util.http_get(url="https://sbert.net/datasets/quora-IR-dataset.zip", path=zip_save_path)
with ZipFile(zip_save_path, "r") as zip:
zip.extractall(dataset_path)
######### Read train data ##########
train_samples_MultipleNegativesRankingLoss = []
train_samples_ConstrativeLoss = []
with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
train_samples_ConstrativeLoss.append(
InputExample(texts=[row["question1"], row["question2"]], label=int(row["is_duplicate"]))
)
if row["is_duplicate"] == "1":
train_samples_MultipleNegativesRankingLoss.append(
InputExample(texts=[row["question1"], row["question2"]], label=1)
)
train_samples_MultipleNegativesRankingLoss.append(
InputExample(texts=[row["question2"], row["question1"]], label=1)
) # if A is a duplicate of B, then B is a duplicate of A
# Create data loader and loss for MultipleNegativesRankingLoss
train_dataloader_MultipleNegativesRankingLoss = DataLoader(
train_samples_MultipleNegativesRankingLoss, shuffle=True, batch_size=train_batch_size
)
train_loss_MultipleNegativesRankingLoss = losses.MultipleNegativesRankingLoss(model)
# Create data loader and loss for OnlineContrastiveLoss
train_dataloader_ConstrativeLoss = DataLoader(train_samples_ConstrativeLoss, shuffle=True, batch_size=train_batch_size)
train_loss_ConstrativeLoss = losses.OnlineContrastiveLoss(model=model, distance_metric=distance_metric, margin=margin)
################### Development Evaluators ##################
# We add 3 evaluators, that evaluate the model on Duplicate Questions pair classification,
# Duplicate Questions Mining, and Duplicate Questions Information Retrieval
evaluators = []
###### Classification ######
# Given (question1, question2), is this a duplicate or not?
# The evaluator will compute the embeddings for both questions and then compute
# a cosine similarity. If the similarity is above a threshold, we have a duplicate.
dev_sentences1 = []
dev_sentences2 = []
dev_labels = []
with open(os.path.join(dataset_path, "classification/dev_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
dev_sentences1.append(row["question1"])
dev_sentences2.append(row["question2"])
dev_labels.append(int(row["is_duplicate"]))
binary_acc_evaluator = evaluation.BinaryClassificationEvaluator(dev_sentences1, dev_sentences2, dev_labels)
evaluators.append(binary_acc_evaluator)
###### Duplicate Questions Mining ######
# Given a large corpus of questions, identify all duplicates in that corpus.
# For faster processing, we limit the development corpus to only 10,000 sentences.
max_dev_samples = 10000
dev_sentences = {}
dev_duplicates = []
with open(os.path.join(dataset_path, "duplicate-mining/dev_corpus.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
dev_sentences[row["qid"]] = row["question"]
if len(dev_sentences) >= max_dev_samples:
break
with open(os.path.join(dataset_path, "duplicate-mining/dev_duplicates.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["qid1"] in dev_sentences and row["qid2"] in dev_sentences:
dev_duplicates.append([row["qid1"], row["qid2"]])
# The ParaphraseMiningEvaluator computes the cosine similarity between all sentences and
# extracts a list with the pairs that have the highest similarity. Given the duplicate
# information in dev_duplicates, it then computes an F1 score measuring how well our duplicate mining worked
paraphrase_mining_evaluator = evaluation.ParaphraseMiningEvaluator(dev_sentences, dev_duplicates, name="dev")
evaluators.append(paraphrase_mining_evaluator)
###### Duplicate Questions Information Retrieval ######
# Given a question and a large corpus of thousands of questions, find the most relevant (i.e. duplicate) question
# in that corpus.
# For faster processing, we limit the development corpus to only 100,000 sentences.
max_corpus_size = 100000
ir_queries = {} # Our queries (qid => question)
ir_needed_qids = set() # QIDs we need in the corpus
ir_corpus = {} # Our corpus (qid => question)
ir_relevant_docs = {} # Mapping of relevant documents for a given query (qid => set([relevant_question_ids]))
with open(os.path.join(dataset_path, "information-retrieval/dev-queries.tsv"), encoding="utf8") as fIn:
next(fIn) # Skip header
for line in fIn:
qid, query, duplicate_ids = line.strip().split("\t")
duplicate_ids = duplicate_ids.split(",")
ir_queries[qid] = query
ir_relevant_docs[qid] = set(duplicate_ids)
for qid in duplicate_ids:
ir_needed_qids.add(qid)
# First get all needed relevant documents (i.e., we must ensure that the relevant questions are actually in the corpus)
distraction_questions = {}
with open(os.path.join(dataset_path, "information-retrieval/corpus.tsv"), encoding="utf8") as fIn:
next(fIn) # Skip header
for line in fIn:
qid, question = line.strip().split("\t")
if qid in ir_needed_qids:
ir_corpus[qid] = question
else:
distraction_questions[qid] = question
# Now, also add some irrelevant questions to fill our corpus
other_qid_list = list(distraction_questions.keys())
random.shuffle(other_qid_list)
for qid in other_qid_list[0 : max(0, max_corpus_size - len(ir_corpus))]:
ir_corpus[qid] = distraction_questions[qid]
# Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR
# metrics. For our use case MRR@k and Accuracy@k are relevant.
ir_evaluator = evaluation.InformationRetrievalEvaluator(ir_queries, ir_corpus, ir_relevant_docs)
evaluators.append(ir_evaluator)
# Create a SequentialEvaluator. This SequentialEvaluator runs all three evaluators in a sequential order.
# We optimize the model with respect to the score from the last evaluator (scores[-1])
seq_evaluator = evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1])
logger.info("Evaluate model without training")
seq_evaluator(model, epoch=0, steps=0, output_path=model_save_path)
# Train the model
model.fit(
train_objectives=[
(train_dataloader_MultipleNegativesRankingLoss, train_loss_MultipleNegativesRankingLoss),
(train_dataloader_ConstrativeLoss, train_loss_ConstrativeLoss),
],
evaluator=seq_evaluator,
epochs=num_epochs,
warmup_steps=1000,
output_path=model_save_path,
)
# Semantic Textual Similarity
Semantic Textual Similarity (STS) assigns a score to the similarity of two texts. In this example, we use the [STSbenchmark](https://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) as training data to fine-tune our network. See the following example scripts for how to tune SentenceTransformer on STS data:
- **[training_stsbenchmark.py](training_stsbenchmark.py)** - This example shows how to create a SentenceTransformer model from scratch by using a pre-trained transformer model together with a pooling layer.
- **[training_stsbenchmark_continue_training.py](training_stsbenchmark_continue_training.py)** - This example shows how to continue training on STS data for a previously created & trained SentenceTransformer model. In that example, we load a model trained on [NLI data](../nli/README.md).
## Training data
In STS, we have sentence pairs annotated together with a score indicating the similarity. For the [STSbenchmark](https://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark), the scores range from 0 (the content of the two sentences is completely different) up to 5 (the two sentences are identical in terms of their meaning). To train our network, we need to normalize these scores to a range of 0-1. This can simply be done by dividing the score by 5.
To store our training data, we create a list with `InputExample` objects. Each `InputExample` contains the sentence pair together with the label (score) that ranges between 0 and 1. A simplified version of how the training data has to look is the following:
```python
from sentence_transformers import (
SentenceTransformer,
SentencesDataset,
InputExample,
losses,
)
model = SentenceTransformer("nli-distilroberta-base-v2")
train_examples = [
InputExample(texts=["My first sentence", "My second sentence"], label=0.8),
InputExample(texts=["Another pair", "Unrelated sentence"], label=0.3),
]
train_dataset = SentencesDataset(train_examples, model)
```
## Loss Function
As loss function we use [CosineSimilarityLoss](../../../docs/package_reference/losses.html#cosinesimilarityloss).
*CosineSimilarityLoss* trains the network with a siamese network structure (for details see: [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084))
![SBERT Siamese Network Architecture](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/SBERT_Siamese_Network.png "SBERT Siamese Architecture")
For each sentence pair, we pass sentence A and sentence B through our network, which yields the embeddings *u* and *v*. The similarity of these embeddings is computed using cosine similarity and the result is compared to the gold similarity score. This allows our network to be fine-tuned and to recognize the similarity of sentences.
This training in a siamese network structure is done automatically when we use CosineSimilarityLoss.
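Putting these pieces together, a minimal training sketch looks like the following. The batch size, epoch count and warm-up steps are only illustrative values, not the settings of the example scripts, and the examples are passed directly to a `DataLoader`, as the training scripts in this repository do:
```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("nli-distilroberta-base-v2")
train_examples = [
    InputExample(texts=["My first sentence", "My second sentence"], label=0.8),
    InputExample(texts=["Another pair", "Unrelated sentence"], label=0.3),
]

# CosineSimilarityLoss embeds both sentences, computes their cosine similarity
# and compares it to the gold similarity score of the pair
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model=model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
)
```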
"""
This example trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch. It generates sentence embeddings
that can be compared using cosine-similarity to measure the similarity.
Usage:
python training_stsbenchmark.py
OR
python training_stsbenchmark.py pretrained_transformer_model_name
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilbert-base-uncased"
# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = (
"output/training_stsbenchmark_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
else:
train_samples.append(inp_example)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
"""
This example loads the pre-trained SentenceTransformer model 'nli-distilroberta-base-v2' from the server.
It then fine-tunes this model for some epochs on the STS benchmark dataset.
Note: In this example, you must specify a SentenceTransformer model.
If you want to fine-tune a huggingface/transformers model like bert-base-uncased, see training_nli.py and training_stsbenchmark.py
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# Read the dataset
model_name = "nli-distilroberta-base-v2"
train_batch_size = 16
num_epochs = 4
model_save_path = (
"output/training_stsbenchmark_continue_training-" + model_name + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name)
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
else:
train_samples.append(inp_example)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
# Development set: Measure correlation between cosine score and gold labels
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
# CT
Carlsson et al. present in [Semantic Re-Tuning With Contrastive Tension (CT)](https://openreview.net/pdf?id=Ov_sMNau-PF) ([Github](https://github.com/FreddeFrallan/Contrastive-Tension)) an unsupervised learning approach for sentence embeddings that just requires sentences.
## Background
During training, CT builds two independent encoders ('Model1' and 'Model2') with initial parameters shared to encode a pair of sentences. If Model1 and Model2 encode the same sentence, then the dot-product of the two sentence embeddings should be large. If Model1 and Model2 encode different sentences, then their dot-product should be small.
The original CT paper uses batches that contain multiple mini-batches. For the example of K=7, each mini-batch consists of the sentence pairs (S_A, S_A), (S_A, S_B), (S_A, S_C), ..., (S_A, S_H) and the corresponding labels are 1, 0, 0, ..., 0. In other words, one identical pair of sentences is viewed as the positive example and the other pairs of different sentences are viewed as the negative examples (i.e. 1 positive + K negative pairs). The training objective is the binary cross-entropy between the generated similarity scores and the labels. This example is illustrated in the figure (from Appendix A.1 of the CT paper) below:
![CT working](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/CT.jpg)
After training, Model2 is used for inference, as it usually has better performance.
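Within sentence-transformers, this batching scheme is provided by `losses.ContrastiveTensionDataLoader` together with `losses.ContrastiveTensionLoss` (see the AskUbuntu training script included below). The following is only a minimal sketch of a CT training run; the model name, sentence list and hyperparameters are illustrative, and the full scripts additionally use RMSprop with a small learning rate:
```python
from sentence_transformers import SentenceTransformer, models, losses

# Build a plain SBERT model: pre-trained transformer + mean pooling
word_embedding_model = models.Transformer("distilbert-base-uncased", max_seq_length=75)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# CT only needs raw, unlabeled sentences (toy corpus here; use a large corpus in practice)
train_sentences = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
] * 1000

# pos_neg_ratio = 1 positive (identical) pair + K negative pairs, i.e. 8 for the K=7 example above.
# batch_size must be divisible by pos_neg_ratio.
train_dataloader = losses.ContrastiveTensionDataLoader(
    train_sentences, batch_size=16, pos_neg_ratio=8
)
train_loss = losses.ContrastiveTensionLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=0,
)
```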
In **[CT_Improved](../CT_In-Batch_Negatives/README.md)** we propose an improvement to CT by using in-batch negative sampling.
## Performance
In some preliminary experiments, we compare the performance on the STSbenchmark dataset (trained with 1 million sentences from Wikipedia) and on the paraphrase mining task for the Quora duplicate questions dataset (trained with questions from Quora).
| Method | STSb (Spearman) | Quora-Duplicate-Question (Avg. Precision) |
| --- | :---: | :---: |
| CT | 75.7 | 36.5 |
| CT-Improved | 78.5 | 40.1 |
Note: We used the code provided in this repository, not the official code from the authors.
## CT from Sentences File
**[train_ct_from_file.py](train_ct_from_file.py)** loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
CT will be trained using these sentences. Checkpoints are stored every 500 steps to the output folder.
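A minimal sketch of that workflow is shown below; the file name and output folder are placeholders, and it assumes a sentence-transformers version whose `model.fit` supports the `checkpoint_path` / `checkpoint_save_steps` arguments:
```python
from sentence_transformers import SentenceTransformer, models, losses

# Placeholder path: a plain text file with one sentence per line
with open("my_sentences.txt", encoding="utf8") as fIn:
    train_sentences = [line.strip() for line in fIn if line.strip()]

word_embedding_model = models.Transformer("distilbert-base-uncased", max_seq_length=75)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_dataloader = losses.ContrastiveTensionDataLoader(train_sentences, batch_size=16, pos_neg_ratio=8)
train_loss = losses.ContrastiveTensionLoss(model)

output_path = "output/train_ct_from_file"  # placeholder output folder
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    output_path=output_path,
    checkpoint_path=output_path,  # store intermediate checkpoints here
    checkpoint_save_steps=500,  # write a checkpoint every 500 training steps
)
```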
## Further Training Examples
- **[train_stsb_ct.py](train_stsb_ct.py)**: This example uses 1 million sentences from Wikipedia to train with CT. It evaluates the performance on the [STSbenchmark dataset](https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark).
- **[train_askubuntu_ct.py](train_askubuntu_ct.py)**: This example trains on [AskUbuntu Questions dataset](https://github.com/taolei87/askubuntu), a dataset with questions from the AskUbuntu Stackexchange forum.
**Note:**
This is a re-implementation of CT within sentence-transformers. For the official CT code, see: [FreddeFrallan/Contrastive-Tension](https://github.com/FreddeFrallan/Contrastive-Tension)
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, evaluation, losses
import logging
import os
import gzip
from datetime import datetime
import torch
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Some training parameters. We use a batch size of 16, for every positive example we include 8-1=7 negative examples
# Sentences are truncated to 75 word pieces
model_name = "distilbert-base-uncased"
batch_size = 16
pos_neg_ratio = 8 # batch_size must be divisible by pos_neg_ratio
max_seq_length = 75
num_epochs = 1
################# Download AskUbuntu and extract training corpus #################
askubuntu_folder = "askubuntu"
output_path = "output/train_askubuntu_ct-{}-{}-{}".format(
model_name, batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
## Download the AskUbuntu dataset from https://github.com/taolei87/askubuntu
for filename in ["text_tokenized.txt.gz", "dev.txt", "test.txt", "train_random.txt"]:
filepath = os.path.join(askubuntu_folder, filename)
if not os.path.exists(filepath):
util.http_get("https://github.com/taolei87/askubuntu/raw/master/" + filename, filepath)
# Read the corpus
corpus = {}
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, "text_tokenized.txt.gz"), "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
id = splits[0]
title = splits[1]
corpus[id] = title
# Read dev & test dataset
def read_eval_dataset(filepath):
dataset = []
with open(filepath) as fIn:
for line in fIn:
query_id, relevant_id, candidate_ids, bm25_scores = line.strip().split("\t")
if len(relevant_id) == 0: # Skip examples without relevant entries
continue
relevant_id = relevant_id.split(" ")
candidate_ids = candidate_ids.split(" ")
negative_ids = set(candidate_ids) - set(relevant_id)
dataset.append(
{
"query": corpus[query_id],
"positive": [corpus[pid] for pid in relevant_id],
"negative": [corpus[pid] for pid in negative_ids],
}
)
dev_test_ids.add(query_id)
dev_test_ids.update(candidate_ids)
return dataset
dev_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "dev.txt"))
test_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "test.txt"))
## Now we need a list of train sentences.
## In this example we simply use all sentences that don't appear in the dev/test set
train_sentences = []
for id, sentence in corpus.items():
if id not in dev_test_ids:
train_sentences.append(sentence)
logging.info("{} train sentences".format(len(train_sentences)))
################# Initialize an SBERT model #################
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Train the model #################
# For ContrastiveTension we need a special data loader to construct batches with the desired properties
train_dataloader = losses.ContrastiveTensionDataLoader(
train_sentences, batch_size=batch_size, pos_neg_ratio=pos_neg_ratio
)
# As loss, we use losses.ContrastiveTensionLoss
train_loss = losses.ContrastiveTensionLoss(model)
# Create a dev evaluator
dev_evaluator = evaluation.RerankingEvaluator(dev_dataset, name="AskUbuntu dev")
test_evaluator = evaluation.RerankingEvaluator(test_dataset, name="AskUbuntu test")
logging.info("Start training")
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=1,
weight_decay=0,
warmup_steps=0,
optimizer_class=torch.optim.RMSprop,
optimizer_params={"lr": 1e-5},
use_amp=False, # Set to True, if your GPU has optimized FP16 cores
)
latest_output_path = output_path + "-latest"
model.save(latest_output_path)
### Run test evaluation on the latest model. This is equivalent to not having a dev dataset
model = SentenceTransformer(latest_output_path)
test_evaluator(model)