"""
This file loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
CT will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
Usage:
python train_ct_from_file.py path/to/sentences.txt
"""
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer
import logging
from datetime import datetime
import gzip
import sys
import tqdm
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
## Training parameters
model_name = "distilbert-base-uncased"
batch_size = 16
pos_neg_ratio = 8 # batch_size must be divisible by pos_neg_ratio
num_epochs = 1
max_seq_length = 75
# Input file path (a text file, each line a sentence)
if len(sys.argv) < 2:
print("Run this script with: python {} path/to/sentences.txt".format(sys.argv[0]))
exit()
filepath = sys.argv[1]
# Save path to store our model
output_name = ""
if len(sys.argv) >= 3:
output_name = "-" + sys.argv[2].replace(" ", "_").replace("/", "_").replace("\\", "_")
model_output_path = "output/train_ct{}-{}".format(output_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Read the train corpus #################
train_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
logging.info("Train sentences: {}".format(len(train_sentences)))
# For ContrastiveTension we need a special data loader to construct batches with the desired properties
train_dataloader = losses.ContrastiveTensionDataLoader(
train_sentences, batch_size=batch_size, pos_neg_ratio=pos_neg_ratio
)
# As loss, we use losses.ContrastiveTensionLoss
train_loss = losses.ContrastiveTensionLoss(model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=warmup_steps,
optimizer_params={"lr": 5e-5},
checkpoint_path=model_output_path,
show_progress_bar=True,
use_amp=False, # Set to True, if your GPU supports FP16 cores
)
import torch
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformer, LoggingHandler, models, util, InputExample
from sentence_transformers import losses
import os
import gzip
import csv
from datetime import datetime
import logging
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
## Training parameters
model_name = "distilbert-base-uncased"
batch_size = 16
pos_neg_ratio = 8 # batch_size must be divisible by pos_neg_ratio
epochs = 1
max_seq_length = 75
# Save path to store our model
model_save_path = "output/train_stsb_ct-{}-{}".format(model_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
################# Train sentences #################
# We use 1 Million sentences from Wikipedia to train our model
wikipedia_dataset_path = "data/wiki1m_for_simcse.txt"
if not os.path.exists(wikipedia_dataset_path):
util.http_get(
"https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt",
wikipedia_dataset_path,
)
# train_sentences are simply your list of sentences
train_sentences = []
with open(wikipedia_dataset_path, "r", encoding="utf8") as fIn:
for line in fIn:
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
################# Download and load STSb #################
data_folder = "data/stsbenchmark"
sts_dataset_path = f"{data_folder}/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
################# Initialize an SBERT model #################
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# For ContrastiveTension we need a special data loader to construct batches with the desired properties
train_dataloader = losses.ContrastiveTensionDataLoader(
train_sentences, batch_size=batch_size, pos_neg_ratio=pos_neg_ratio
)
# As loss, we use losses.ContrastiveTensionLoss
train_loss = losses.ContrastiveTensionLoss(model)
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=1,
evaluation_steps=1000,
weight_decay=0,
warmup_steps=0,
optimizer_class=torch.optim.RMSprop,
optimizer_params={"lr": 1e-5},
output_path=model_save_path,
use_amp=False, # Set to True, if your GPU has optimized FP16 cores
)
########### Load the model and evaluate on test set
model = SentenceTransformer(model_save_path)
test_evaluator(model)
# CT (In-Batch Negatives)
Carlsson et al. present in [Semantic Re-Tuning With Contrastive Tension (CT)](https://openreview.net/pdf?id=Ov_sMNau-PF) an unsupervised learning approach for sentence embeddings that just requires sentences.
## Background
During training, CT builds two independent encoders ('Model1' and 'Model2') that share their initial parameters to encode a pair of sentences. If Model1 and Model2 encode the same sentence, then the dot-product of the two sentence embeddings should be large. If Model1 and Model2 encode different sentences, then their dot-product should be small.
In the original CT paper, specially created batches are used. We implemented an improved version that uses in-batch negative sampling: Model1 and Model2 both encode the same set of sentences. We maximize the scores for matching indexes (i.e. Model1(S_i) and Model2(S_i)) while we minimize the scores for different indexes (i.e. Model1(S_i) and Model2(S_j) for i != j).
Using in-batch negative sampling gives a stronger training signal than the original loss function proposed by Carlsson et al.
![CT working](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/CT.jpg)
After training, Model2 is used for inference, as it usually achieves better performance.
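Training with the in-batch negatives variant only requires a list of sentences. Below is a minimal sketch based on the training scripts referenced further down (the model name, batch size, and sentence list are placeholders):

```python
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers import models, losses
from torch.utils.data import DataLoader

# Define a SentenceTransformer model with mean pooling
model_name = "distilbert-base-uncased"
word_embedding_model = models.Transformer(model_name, max_seq_length=75)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Define a list with sentences (1k - 100k sentences)
train_sentences = [
    "Your set of sentences",
    "In practice you should provide at least 1k sentences",
]

# Pair each sentence with itself; the other sentences in the batch act as negatives
train_data = [InputExample(texts=[s, s]) for s in train_sentences]
train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)

# In-batch negatives variant of the Contrastive Tension loss
train_loss = losses.ContrastiveTensionLossInBatchNegatives(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    show_progress_bar=True,
)
model.save("output/ct-improved-model")
```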
## Performance
In some preliminary experiments, we compare performance on the STSbenchmark dataset (trained with 1 million sentences from Wikipedia) and on the Quora duplicate questions dataset (trained with questions from Quora).
| Method | STSb (Spearman) | Quora-Duplicate-Question (Avg. Precision) |
| --- | :---: | :---: |
| CT | 75.7 | 36.5 |
| CT (In-Batch Negatives) | 78.5 | 40.1 |
Note: We used the code provided in this repository, not the official code from the authors.
## CT from Sentences File
**[train_ct-improved_from_file.py](train_ct-improved_from_file.py)** loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
CT with in-batch negatives will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
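Assuming your sentences are stored one per line, the script is run as:

```bash
python train_ct-improved_from_file.py path/to/sentences.txt
```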
## Further Training Examples
- **[train_stsb_ct-improved.py](train_stsb_ct-improved.py)**: This example uses 1 million sentences from Wikipedia to train with CT. It evaluates the performance on the [STSbenchmark dataset](https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark).
- **[train_askubuntu_ct-improved.py](train_askubuntu_ct-improved.py)**: This example trains on [AskUbuntu Questions dataset](https://github.com/taolei87/askubuntu), a dataset with questions from the AskUbuntu Stackexchange forum.
from sentence_transformers import SentenceTransformer, LoggingHandler, InputExample
from sentence_transformers import models, util, evaluation, losses
import logging
import os
import gzip
from datetime import datetime
from torch.utils.data import DataLoader
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Some training parameters. We use a batch size of 128 with in-batch negative sampling.
# Sentences are truncated to 75 word pieces
## Training parameters
model_name = "distilbert-base-uncased"
batch_size = 128
epochs = 1
max_seq_length = 75
################# Download AskUbuntu and extract training corpus #################
askubuntu_folder = "askubuntu"
output_path = "output/train_askubuntu_ct-improved-{}-{}-{}".format(
model_name, batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
## Download the AskUbuntu dataset from https://github.com/taolei87/askubuntu
for filename in ["text_tokenized.txt.gz", "dev.txt", "test.txt", "train_random.txt"]:
filepath = os.path.join(askubuntu_folder, filename)
if not os.path.exists(filepath):
util.http_get("https://github.com/taolei87/askubuntu/raw/master/" + filename, filepath)
# Read the corpus
corpus = {}
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, "text_tokenized.txt.gz"), "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
id = splits[0]
title = splits[1]
corpus[id] = title
# Read dev & test dataset
def read_eval_dataset(filepath):
dataset = []
with open(filepath) as fIn:
for line in fIn:
query_id, relevant_id, candidate_ids, bm25_scores = line.strip().split("\t")
if len(relevant_id) == 0: # Skip examples without relevant entries
continue
relevant_id = relevant_id.split(" ")
candidate_ids = candidate_ids.split(" ")
negative_ids = set(candidate_ids) - set(relevant_id)
dataset.append(
{
"query": corpus[query_id],
"positive": [corpus[pid] for pid in relevant_id],
"negative": [corpus[pid] for pid in negative_ids],
}
)
dev_test_ids.add(query_id)
dev_test_ids.update(candidate_ids)
return dataset
dev_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "dev.txt"))
test_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "test.txt"))
## Now we need a list of train sentences.
## In this example we simply use all sentences that don't appear in the dev/test set
train_sentences = []
for id, sentence in corpus.items():
if id not in dev_test_ids:
train_sentences.append(InputExample(texts=[sentence, sentence]))
logging.info("{} train sentences".format(len(train_sentences)))
################# Initialize an SBERT model #################
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Train the model #################
# For the in-batch negatives variant, a regular torch DataLoader with shuffling is sufficient
train_dataloader = DataLoader(train_sentences, batch_size=batch_size, shuffle=True, drop_last=True)
# As loss, we use losses.ContrastiveTensionLossInBatchNegatives
train_loss = losses.ContrastiveTensionLossInBatchNegatives(model)
# Create a dev evaluator
dev_evaluator = evaluation.RerankingEvaluator(dev_dataset, name="AskUbuntu dev")
test_evaluator = evaluation.RerankingEvaluator(test_dataset, name="AskUbuntu test")
logging.info("Start training")
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=1,
warmup_steps=100,
use_amp=True, # Set to True, if your GPU has optimized FP16 cores
)
latest_output_path = output_path + "-latest"
model.save(latest_output_path)
### Run test evaluation on the latest model. This is equivalent to not having a dev dataset
model = SentenceTransformer(latest_output_path)
test_evaluator(model)
"""
This file loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
CT (In-Batch Negatives) will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
Usage:
python train_ct-improved_from_file.py path/to/sentences.txt
"""
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer
import logging
from datetime import datetime
import gzip
import sys
import tqdm
from torch.utils.data import DataLoader
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
## Training parameters
model_name = "distilbert-base-uncased"
batch_size = 128
num_epochs = 1
max_seq_length = 75
# Input file path (a text file, each line a sentence)
if len(sys.argv) < 2:
print("Run this script with: python {} path/to/sentences.txt".format(sys.argv[0]))
exit()
filepath = sys.argv[1]
# Save path to store our model
output_name = ""
if len(sys.argv) >= 3:
output_name = "-" + sys.argv[2].replace(" ", "_").replace("/", "_").replace("\\", "_")
model_output_path = "output/train_ct-improved{}-{}".format(output_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Read the train corpus #################
train_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
logging.info("Train sentences: {}".format(len(train_sentences)))
# A regular torch DataLoader and as loss we use losses.ContrastiveTensionLossInBatchNegatives
train_dataloader = DataLoader(train_sentences, batch_size=batch_size, shuffle=True, drop_last=True)
train_loss = losses.ContrastiveTensionLossInBatchNegatives(model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=warmup_steps,
optimizer_params={"lr": 5e-5},
checkpoint_path=model_output_path,
show_progress_bar=True,
use_amp=False, # Set to True, if your GPU supports FP16 cores
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformer, LoggingHandler, models, util, InputExample
from sentence_transformers import losses
import os
import gzip
import csv
from datetime import datetime
import logging
from torch.utils.data import DataLoader
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
## Training parameters
model_name = "distilbert-base-uncased"
batch_size = 128
epochs = 1
max_seq_length = 75
# Save path to store our model
model_save_path = "output/training_stsb_ct-improved-{}-{}".format(
model_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
################# Train sentences #################
# We use 1 Million sentences from Wikipedia to train our model
wikipedia_dataset_path = "data/wiki1m_for_simcse.txt"
if not os.path.exists(wikipedia_dataset_path):
util.http_get(
"https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt",
wikipedia_dataset_path,
)
# train_sentences holds InputExample objects where the same sentence is passed twice, i.e. texts=[sent, sent]
train_sentences = []
with open(wikipedia_dataset_path, "r", encoding="utf8") as fIn:
for line in fIn:
train_sentences.append(InputExample(texts=[line.strip(), line.strip()]))
################# Download and load STSb #################
data_folder = "data/stsbenchmark"
sts_dataset_path = f"{data_folder}/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
################# Initialize an SBERT model #################
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# For the in-batch negatives variant, a regular torch DataLoader with shuffling is sufficient
train_dataloader = DataLoader(train_sentences, batch_size=batch_size, shuffle=True, drop_last=True)
# As loss, we use losses.ContrastiveTensionLossInBatchNegatives with dot-product scoring
train_loss = losses.ContrastiveTensionLossInBatchNegatives(model, scale=1, similarity_fct=util.dot_score)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=1,
evaluation_steps=1000,
warmup_steps=1000,
output_path=model_save_path,
optimizer_params={"lr": 5e-5},
use_amp=True, # Set to True, if your GPU supports FP16 cores
)
########### Load the model and evaluate on test set
model = SentenceTransformer(model_save_path)
test_evaluator(model)
# MLM
Masked Language Model (MLM) is the process by which BERT was pre-trained. It has been shown that continuing MLM on your own data can improve performance (see [Don't Stop Pretraining: Adapt Language Models to Domains and Tasks](https://arxiv.org/abs/2004.10964)). In our [TSDAE-paper](https://arxiv.org/abs/2104.06979) we also show that MLM is a powerful pre-training strategy for learning sentence embeddings. This is especially the case when you work on a specialized domain.
**Note:** Only running MLM will not yield good sentence embeddings. But you can first tune your favorite transformer model with MLM on your domain-specific data. Then you can fine-tune the model with the labeled data you have, or use other datasets like [NLI](../../training/nli/README.md), [Paraphrases](../../training/paraphrases/README.md), or [STS](../../training/sts/README.md).
![MLM working](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/MLM.png)
## Running MLM
The **[train_mlm.py](train_mlm.py)** script provides an easy option to run MLM on your data. You can run it with:
```bash
python train_mlm.py distilbert-base-uncased path/train.txt
```
You can also provide an optional dev dataset:
```bash
python train_mlm.py distilbert-base-uncased path/train.txt path/dev.txt
```
Each line in train.txt / dev.txt is interpreted as one input for the transformer network, i.e. as one sentence or paragraph.
For more information on how to run MLM with Hugging Face transformers, see the [Language model training examples](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling).
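After MLM training, the checkpoint in the output folder can serve as the base model of a SentenceTransformer, which you then fine-tune with labeled data. A minimal sketch, assuming the checkpoint was saved to `output/mlm-model` (the path and the training pairs are placeholders):

```python
from sentence_transformers import SentenceTransformer, InputExample, models, losses
from torch.utils.data import DataLoader

# Load the MLM-tuned checkpoint as the word embedding model
word_embedding_model = models.Transformer("output/mlm-model", max_seq_length=75)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Fine-tune with whatever labeled pairs you have (dummy similarity pairs shown here)
train_examples = [
    InputExample(texts=["A man is eating food.", "A man eats something."], label=0.9),
    InputExample(texts=["A man is eating food.", "A plane is taking off."], label=0.1),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)
```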
"""
This script runs Masked Language Model (MLM) training. You provide a training file. Each line is interpreted as a sentence / paragraph.
Optionally, you can also provide a dev file.
The fine-tuned model is stored in the output/model_name folder.
Usage:
python train_mlm.py model_name data/train_sentences.txt [data/dev_sentences.txt]
"""
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import Trainer, TrainingArguments
import sys
import gzip
from datetime import datetime
if len(sys.argv) < 3:
print("Usage: python train_mlm.py model_name data/train_sentences.txt [data/dev_sentences.txt]")
exit()
model_name = sys.argv[1]
per_device_train_batch_size = 64
save_steps = 1000 # Save model every 1k steps
num_train_epochs = 3 # Number of epochs
use_fp16 = False # Set to True, if your GPU supports FP16 operations
max_length = 100 # Max length for a text input
do_whole_word_mask = True # If set to true, whole words are masked
mlm_prob = 0.15 # Probability that a word is replaced by a [MASK] token
# Load the model
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
output_dir = "output/{}-{}".format(model_name.replace("/", "_"), datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
print("Save checkpoints to:", output_dir)
##### Load our training datasets
train_sentences = []
train_path = sys.argv[2]
with gzip.open(train_path, "rt", encoding="utf8") if train_path.endswith(".gz") else open(
train_path, "r", encoding="utf8"
) as fIn:
for line in fIn:
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
print("Train sentences:", len(train_sentences))
dev_sentences = []
if len(sys.argv) >= 4:
dev_path = sys.argv[3]
with gzip.open(dev_path, "rt", encoding="utf8") if dev_path.endswith(".gz") else open(
dev_path, "r", encoding="utf8"
) as fIn:
for line in fIn:
line = line.strip()
if len(line) >= 10:
dev_sentences.append(line)
print("Dev sentences:", len(dev_sentences))
# A dataset wrapper that tokenizes our data on the fly
class TokenizedSentencesDataset:
def __init__(self, sentences, tokenizer, max_length, cache_tokenization=False):
self.tokenizer = tokenizer
self.sentences = sentences
self.max_length = max_length
self.cache_tokenization = cache_tokenization
def __getitem__(self, item):
if not self.cache_tokenization:
return self.tokenizer(
self.sentences[item],
add_special_tokens=True,
truncation=True,
max_length=self.max_length,
return_special_tokens_mask=True,
)
if isinstance(self.sentences[item], str):
self.sentences[item] = self.tokenizer(
self.sentences[item],
add_special_tokens=True,
truncation=True,
max_length=self.max_length,
return_special_tokens_mask=True,
)
return self.sentences[item]
def __len__(self):
return len(self.sentences)
train_dataset = TokenizedSentencesDataset(train_sentences, tokenizer, max_length)
dev_dataset = (
TokenizedSentencesDataset(dev_sentences, tokenizer, max_length, cache_tokenization=True)
if len(dev_sentences) > 0
else None
)
##### Training arguments
if do_whole_word_mask:
data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)
else:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)
training_args = TrainingArguments(
output_dir=output_dir,
overwrite_output_dir=True,
num_train_epochs=num_train_epochs,
evaluation_strategy="steps" if dev_dataset is not None else "no",
per_device_train_batch_size=per_device_train_batch_size,
eval_steps=save_steps,
save_steps=save_steps,
logging_steps=save_steps,
save_total_limit=1,
prediction_loss_only=True,
fp16=use_fp16,
)
trainer = Trainer(
model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=dev_dataset
)
print("Save tokenizer to:", output_dir)
tokenizer.save_pretrained(output_dir)
trainer.train()
print("Save model to:", output_dir)
model.save_pretrained(output_dir)
print("Training done")
# Unsupervised Learning
This page contains a collection of unsupervised learning methods to learn sentence embeddings. The methods have in common that they **do not require labeled training data**. Instead, they can learn semantically meaningful sentence embeddings just from the text itself.
**Note:** Unsupervised learning approaches are still an active research area, and in many cases the models perform rather poorly compared to models trained on labeled pairs as provided in our [training data collection](https://huggingface.co/datasets/sentence-transformers/embedding-training-data). A better approach is **[Domain Adaptation](../domain_adaptation/README.md)**, where you combine unsupervised learning on your target domain with existing labeled data. This gives the best performance on your specific corpus.
## TSDAE
In our work [TSDAE (Transformer-based Denoising AutoEncoder)](https://arxiv.org/abs/2104.06979) we present an unsupervised sentence embedding learning method based on denoising auto-encoders:
![](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/TSDAE.png)
We add noise to the input text; in our case, we delete about 60% of the words in the text. The encoder maps this input to a fixed-sized sentence embedding. A decoder then tries to re-create the original text without the noise. Later, we use the encoder as the sentence embedding method.
See **[TSDAE](TSDAE/README.md)** for more information and training examples.
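To see what the added noise looks like, you can wrap a few sentences in the `DenoisingAutoEncoderDataset`; a minimal sketch (the example sentence is a placeholder):

```python
from sentence_transformers import datasets

train_sentences = ["The quick brown fox jumps over the lazy dog."]

# The dataset adds deletion noise on the fly and pairs the noised sentence with the original
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
example = train_dataset[0]
print(example.texts)  # the noised sentence together with the original sentence
```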
## SimCSE
Gao et al. present in [SimCSE: Simple Contrastive Learning of Sentence Embeddings](https://arxiv.org/abs/2104.08821) a method that passes the same sentence twice to the sentence embedding encoder. Due to dropout, it will be encoded at slightly different positions in vector space.
The distance between these two embeddings will be minimized, while the distance to the embeddings of the other sentences in the same batch will be maximized.
![SimCSE working](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/SimCSE.png)
See **[SimCSE](SimCSE/README.md)** for more information and training examples.
## CT
Carlsson et al. present in [Semantic Re-Tuning With Contrastive Tension (CT)](https://openreview.net/pdf?id=Ov_sMNau-PF) an unsupervised method that uses two models: if the same sentence is passed to Model1 and Model2, the respective sentence embeddings should get a large dot-score. If different sentences are passed, the sentence embeddings should get a low score.
![CT working](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/CT.jpg)
See **[CT](CT/README.md)** for more information and training examples.
## CT (In-Batch Negative Sampling)
The CT method from Carlsson et al. provides sentence pairs to the two models. This can be improved by using in-batch negative sampling: Model1 and Model2 both encode the same set of sentences. We maximize the scores for matching indexes (i.e. Model1(S_i) and Model2(S_i)) while we minimize the scores for different indexes (i.e. Model1(S_i) and Model2(S_j) for i != j).
See **[CT_In-Batch_Negatives](CT_In-Batch_Negatives/README.md)** for more information and training examples.
## Masked Language Model (MLM)
BERT showed that Masked Language Modeling (MLM) is a powerful pre-training approach. It is advisable to first run MLM on a large dataset from your domain before you do fine-tuning. See **[MLM](MLM/README.md)** for more information and training examples.
## GenQ
In our paper [BEIR: A Heterogenous Benchmark for Zero-shot Evaluation of Information Retrieval Models](https://arxiv.org/abs/2104.08663) we present a method to learn a semantic search method by generating queries for given passages. This method has been improved in [GPL: Generative Pseudo Labeling for Unsupervised Domain Adaptation of Dense Retrieval](https://arxiv.org/abs/2112.07577).
We pass all passages in our collection through a trained T5 model, which generates potential queries from users. We then use these (query, passage) pairs to train a SentenceTransformer model.
![Query Generation](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/query-generation.png)
See **[GenQ](query_generation/README.md)** for more information and training examples. See **[GPL](../domain_adaptation/README.md)** for the improved version that uses a multi-step training approach.
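A minimal sketch of the query-generation step (the T5 checkpoint name and the passage are examples, not fixed choices):

```python
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "BeIR/query-gen-msmarco-t5-base-v1"  # example query-generation checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

passage = "Python is an interpreted, high-level, general-purpose programming language."
inputs = tokenizer(passage, return_tensors="pt")

# Sample several synthetic queries for the passage
outputs = model.generate(**inputs, max_length=64, do_sample=True, top_p=0.95, num_return_sequences=3)
for output in outputs:
    print(tokenizer.decode(output, skip_special_tokens=True))

# The resulting (query, passage) pairs can then be used to train a SentenceTransformer,
# e.g. with MultipleNegativesRankingLoss.
```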
## GPL
In [GPL: Generative Pseudo Labeling for Unsupervised Domain Adaptation of Dense Retrieval](https://arxiv.org/abs/2112.07577) we show an improved version of GenQ, which combines the generation with negative mining and pseudo labeling using a Cross-Encoder. It leads to significantly improved results. See **[Domain Adaptation](../domain_adaptation/README.md)** for more information.
![GPL Architecture](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/gpl_architecture.png)
## Performance Comparison
In our paper [TSDAE](https://arxiv.org/abs/2104.06979) we compare approaches for sentence embedding tasks, and in [GPL](https://arxiv.org/abs/2112.07577) we compare them for semantic search tasks (given a query, find relevant passages). While the unsupervised approaches achieve acceptable performance for sentence embedding tasks, they perform poorly for semantic search tasks.
# SimCSE
Gao et al. present in [SimCSE](https://arxiv.org/abs/2104.08821) a simple method to train sentence embeddings without having training data.
The idea is to encode the same sentence twice. Due to the dropout used in transformer models, both sentence embeddings will end up at slightly different positions. The distance between these two embeddings will be minimized, while the distance to the embeddings of the other sentences in the same batch will be maximized (they serve as negative examples).
![SimCSE working](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/SimCSE.png)
## Usage with SentenceTransformers
SentenceTransformers implements the [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss), which makes training with SimCSE trivial:
```python
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers import models, losses
from torch.utils.data import DataLoader
# Define your sentence transformer model using mean pooling
model_name = "distilroberta-base"
word_embedding_model = models.Transformer(model_name, max_seq_length=32)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Define a list with sentences (1k - 100k sentences)
train_sentences = [
"Your set of sentences",
"Model will automatically add the noise",
"And re-construct it",
"You should provide at least 1k sentences",
]
# Convert train sentences to sentence pairs
train_data = [InputExample(texts=[s, s]) for s in train_sentences]
# DataLoader to batch your data
train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)
# Use MultipleNegativesRankingLoss (in-batch negatives; each sentence is paired with itself)
train_loss = losses.MultipleNegativesRankingLoss(model)
# Call the fit method
model.fit(
train_objectives=[(train_dataloader, train_loss)], epochs=1, show_progress_bar=True
)
model.save("output/simcse-model")
```
## SimCSE from Sentences File
**[train_simcse_from_file.py](train_simcse_from_file.py)** loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
SimCSE will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
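Run it with the path to your sentences file:

```bash
python train_simcse_from_file.py path/to/sentences.txt
```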
## Training Examples
- **[train_askubuntu_simcse.py](train_askubuntu_simcse.py)** - Shows an example of how to train with SimCSE on the [AskUbuntu Questions dataset](https://github.com/taolei87/askubuntu).
- **[train_stsb_simcse.py](train_stsb_simcse.py)** - This script uses 1 million sentences from Wikipedia for training and evaluates SimCSE on the [STSbenchmark dataset](https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark).
## Ablation Study
We use the evaluation setup proposed in our [TSDAE paper](https://arxiv.org/abs/2104.06979).
Using mean pooling, with max_seq_length=32 and batch_size=128
| Base Model | AskUbuntu Test-Performance (MAP) |
| ---- | :----: |
| distilbert-base-uncased | 53.59 |
| bert-base-uncased | 54.89 |
| **distilroberta-base** | **56.16** |
| roberta-base | 55.89 |
Using mean pooling, with max_seq_length=32 and distilroberta-base model.
| Batch Size | AskUbuntu Test-Performance (MAP) |
| ---- | :----: |
| 128 | 56.16 |
| 256 | 56.63 |
| **512** | **56.69** |
Using max_seq_length=32, distilroberta-base model, and 512 batch size.
| Pooling Mode | AskUbuntu Test-Performance (MAP) |
| ---- | :----: |
| **Mean pooling** | **56.69** |
| CLS pooling | 56.56 |
| Max pooling | 52.91 |
**Note:**
This is a re-implementation of SimCSE within sentence-transformers. For the official SimCSE code, see: [princeton-nlp/SimCSE](https://github.com/princeton-nlp/SimCSE)
from sentence_transformers import SentenceTransformer, LoggingHandler, InputExample
from sentence_transformers import models, util, evaluation, losses
import logging
import os
import gzip
from torch.utils.data import DataLoader
from datetime import datetime
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Some training parameters. For this example, we use a batch_size of 128, a max sentence length (max_seq_length)
# of 32 word pieces, and roberta-base as the model
model_name = "roberta-base"
batch_size = 128
max_seq_length = 32
num_epochs = 1
################# Download AskUbuntu and extract training corpus #################
askubuntu_folder = "data/askubuntu"
output_path = "output/askubuntu-simcse-{}-{}-{}".format(
model_name, batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
## Download the AskUbuntu dataset from https://github.com/taolei87/askubuntu
for filename in ["text_tokenized.txt.gz", "dev.txt", "test.txt", "train_random.txt"]:
filepath = os.path.join(askubuntu_folder, filename)
if not os.path.exists(filepath):
util.http_get("https://github.com/taolei87/askubuntu/raw/master/" + filename, filepath)
# Read the corpus
corpus = {}
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, "text_tokenized.txt.gz"), "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
id = splits[0]
title = splits[1]
corpus[id] = title
# Read dev & test dataset
def read_eval_dataset(filepath):
dataset = []
with open(filepath) as fIn:
for line in fIn:
query_id, relevant_id, candidate_ids, bm25_scores = line.strip().split("\t")
if len(relevant_id) == 0: # Skip examples without relevant entries
continue
relevant_id = relevant_id.split(" ")
candidate_ids = candidate_ids.split(" ")
negative_ids = set(candidate_ids) - set(relevant_id)
dataset.append(
{
"query": corpus[query_id],
"positive": [corpus[pid] for pid in relevant_id],
"negative": [corpus[pid] for pid in negative_ids],
}
)
dev_test_ids.add(query_id)
dev_test_ids.update(candidate_ids)
return dataset
dev_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "dev.txt"))
test_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "test.txt"))
## Now we need a list of train sentences.
## In this example we simply use all sentences that don't appear in the dev/test set
train_sentences = []
for id, sentence in corpus.items():
if id not in dev_test_ids:
train_sentences.append(InputExample(texts=[sentence, sentence]))
logging.info("{} train sentences".format(len(train_sentences)))
################# Initialize an SBERT model #################
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
# Apply mean pooling
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Train the model #################
# As Loss function, we use MultipleNegativesRankingLoss
train_dataloader = DataLoader(train_sentences, batch_size=batch_size, shuffle=True, drop_last=True)
train_loss = losses.MultipleNegativesRankingLoss(model)
# Create a dev evaluator
dev_evaluator = evaluation.RerankingEvaluator(dev_dataset, name="AskUbuntu dev")
test_evaluator = evaluation.RerankingEvaluator(test_dataset, name="AskUbuntu test")
logging.info("Dev performance before training")
dev_evaluator(model)
warmup_steps = int(num_epochs * len(train_dataloader) * 0.1)
logging.info("Start training")
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
evaluation_steps=100,
epochs=num_epochs,
warmup_steps=warmup_steps,
output_path=output_path,
show_progress_bar=True,
use_amp=True, # If your GPU does not have FP16 cores, set use_amp=False
)
latest_output_path = output_path + "-latest"
model.save(latest_output_path)
### Run test evaluation on the latest model. This is equivalent to not having a dev dataset
model = SentenceTransformer(latest_output_path)
test_evaluator(model)
"""
This file loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
SimCSE will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
Usage:
python train_simcse_from_file.py path/to/sentences.txt
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, InputExample
import logging
from datetime import datetime
import gzip
import sys
import tqdm
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Training parameters
model_name = "distilroberta-base"
train_batch_size = 128
max_seq_length = 32
num_epochs = 1
# Input file path (a text file, each line a sentence)
if len(sys.argv) < 2:
print("Run this script with: python {} path/to/sentences.txt".format(sys.argv[0]))
exit()
filepath = sys.argv[1]
# Save path to store our model
output_name = ""
if len(sys.argv) >= 3:
output_name = "-" + sys.argv[2].replace(" ", "_").replace("/", "_").replace("\\", "_")
model_output_path = "output/train_simcse{}-{}".format(output_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Read the train corpus #################
train_samples = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
train_samples.append(InputExample(texts=[line, line]))
logging.info("Train sentences: {}".format(len(train_samples)))
# We train our model using the MultipleNegativesRankingLoss
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)
train_loss = losses.MultipleNegativesRankingLoss(model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=warmup_steps,
optimizer_params={"lr": 5e-5},
checkpoint_path=model_output_path,
show_progress_bar=True,
use_amp=False, # Set to True, if your GPU supports FP16 cores
)
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Training parameters
model_name = "distilbert-base-uncased"
train_batch_size = 128
num_epochs = 1
max_seq_length = 32
# Save path to store our model
model_save_path = "output/training_stsb_simcse-{}-{}-{}".format(
model_name, train_batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "data/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# We use 1 Million sentences from Wikipedia to train our model
wikipedia_dataset_path = "data/wiki1m_for_simcse.txt"
if not os.path.exists(wikipedia_dataset_path):
util.http_get(
"https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt",
wikipedia_dataset_path,
)
# train_samples is a list of InputExample objects where we pass the same sentence twice to texts, i.e. texts=[sent, sent]
train_samples = []
with open(wikipedia_dataset_path, "r", encoding="utf8") as fIn:
for line in fIn:
line = line.strip()
if len(line) >= 10:
train_samples.append(InputExample(texts=[line, line]))
# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
if row["split"] == "dev":
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
elif row["split"] == "test":
test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
dev_samples, batch_size=train_batch_size, name="sts-dev"
)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
test_samples, batch_size=train_batch_size, name="sts-test"
)
# We train our model using the MultipleNegativesRankingLoss
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)
train_loss = losses.MultipleNegativesRankingLoss(model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
evaluation_steps = int(len(train_dataloader) * 0.1) # Evaluate every 10% of the data
logging.info("Training sentences: {}".format(len(train_samples)))
logging.info("Warmup-steps: {}".format(warmup_steps))
logging.info("Performance before training")
dev_evaluator(model)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=evaluation_steps,
warmup_steps=warmup_steps,
output_path=model_save_path,
optimizer_params={"lr": 5e-5},
use_amp=True, # Set to True, if your GPU supports FP16 cores
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator(model, output_path=model_save_path)
# TSDAE
This section shows an example of how to train an unsupervised [TSDAE (Transformer-based Denoising AutoEncoder)](https://arxiv.org/abs/2104.06979) model using only plain sentences as training data.
## Background
During training, TSDAE encodes damaged sentences into fixed-sized vectors and requires the decoder to reconstruct the original sentences from these sentence embeddings. For good reconstruction quality, the semantics must be captured well in the sentence embeddings from the encoder. Later, at inference, we only use the encoder for creating sentence embeddings. The architecture is illustrated in the figure below:
![](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/TSDAE.png)
## Unsupervised Training with TSDAE
Training with TSDAE is simple. You just need a set of sentences:
```python
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader
# Define your sentence transformer model using CLS pooling
model_name = "bert-base-uncased"
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Define a list with sentences (1k - 100k sentences)
train_sentences = [
"Your set of sentences",
"Model will automatically add the noise",
"And re-construct it",
"You should provide at least 1k sentences",
]
# Create the special denoising dataset that adds noise on-the-fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
# DataLoader to batch your data
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# Use the denoising auto-encoder loss
train_loss = losses.DenoisingAutoEncoderLoss(
model, decoder_name_or_path=model_name, tie_encoder_decoder=True
)
# Call the fit method
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=1,
weight_decay=0,
scheduler="constantlr",
optimizer_params={"lr": 3e-5},
show_progress_bar=True,
)
model.save("output/tsdae-model")
```
## TSDAE from Sentences File
**[train_tsdae_from_file.py](train_tsdae_from_file.py)** loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
TSDAE will be trained on these sentences. Checkpoints are stored every 500 steps to the output folder.
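For example:

```bash
python train_tsdae_from_file.py path/to/sentences.txt
```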
## TSDAE on AskUbuntu Dataset
The [AskUbuntu dataset](https://github.com/taolei87/askubuntu) is a manually annotated dataset for the [AskUbuntu forum](https://askubuntu.com/). For 400 questions, experts annotated 20 other questions per question as related or not related. The questions are split into a train and a development set.
**[train_askubuntu_tsdae.py](train_askubuntu_tsdae.py)** - Shows an example of how to train a model on AskUbuntu using only sentences without any labels. As sentences, we use the titles that are not used in the dev / test set.
| Model | MAP-Score on test set |
| ---- | :----: |
| TSDAE (bert-base-uncased) | 59.4 |
| **pretrained SentenceTransformer models** | |
| nli-bert-base | 50.7 |
| paraphrase-distilroberta-base-v1 | 54.8 |
| stsb-roberta-large | 54.6 |
----------------------
## TSDAE as Pre-Training Task
As we show in our [TSDAE paper](https://arxiv.org/abs/2104.06979), TSDAE is also a powerful pre-training method, outperforming the classical Masked Language Model (MLM) pre-training task.
You first train your model with the TSDAE loss. After you have trained for a certain number of steps / after the model has converged, you can further fine-tune your pre-trained model like any other SentenceTransformer model.
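A minimal sketch of this second, supervised stage, assuming the TSDAE model was stored in `output/tsdae-model` (the path and the training pairs are placeholders):

```python
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Load the TSDAE pre-trained encoder
model = SentenceTransformer("output/tsdae-model")

# Continue with any supervised objective, e.g. positive pairs and MultipleNegativesRankingLoss
train_examples = [
    InputExample(texts=["How do I upgrade Ubuntu?", "Upgrading to the latest Ubuntu release"]),
    InputExample(texts=["Wifi does not work after suspend", "Wireless connection lost when resuming"]),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)
```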
## Citation
If you use the code for TSDAE, feel free to cite our publication [TSDAE: Using Transformer-based Sequential Denoising Auto-Encoder for Unsupervised Sentence Embedding Learning](https://arxiv.org/abs/2104.06979):
```bibtex
@article{wang-2021-TSDAE,
title = "TSDAE: Using Transformer-based Sequential Denoising Auto-Encoderfor Unsupervised Sentence Embedding Learning",
author = "Wang, Kexin and Reimers, Nils and Gurevych, Iryna",
journal= "arXiv preprint arXiv:2104.06979",
month = "4",
year = "2021",
url = "https://arxiv.org/abs/2104.06979",
}
```
"""
This script runs the evaluation (dev & test) for the AskUbuntu dataset
Usage:
python eval_askubuntu.py [sbert_model_name_or_path]
"""
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import util, evaluation
import logging
import os
import gzip
import sys
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
model = SentenceTransformer(sys.argv[1])
################# Download AskUbuntu and extract training corpus #################
askubuntu_folder = "askubuntu"
training_corpus = os.path.join(askubuntu_folder, "train.unsupervised.txt")
## Download the AskUbuntu dataset from https://github.com/taolei87/askubuntu
for filename in ["text_tokenized.txt.gz", "dev.txt", "test.txt", "train_random.txt"]:
filepath = os.path.join(askubuntu_folder, filename)
if not os.path.exists(filepath):
util.http_get("https://github.com/taolei87/askubuntu/raw/master/" + filename, filepath)
# Read the corpus
corpus = {}
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, "text_tokenized.txt.gz"), "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
id = splits[0]
title = splits[1]
corpus[id] = title
# Read dev & test dataset
def read_eval_dataset(filepath):
dataset = []
with open(filepath) as fIn:
for line in fIn:
query_id, relevant_id, candidate_ids, bm25_scores = line.strip().split("\t")
if len(relevant_id) == 0: # Skip examples without relevant entries
continue
relevant_id = relevant_id.split(" ")
candidate_ids = candidate_ids.split(" ")
negative_ids = set(candidate_ids) - set(relevant_id)
dataset.append(
{
"query": corpus[query_id],
"positive": [corpus[pid] for pid in relevant_id],
"negative": [corpus[pid] for pid in negative_ids],
}
)
dev_test_ids.add(query_id)
dev_test_ids.update(candidate_ids)
return dataset
dev_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "dev.txt"))
test_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "test.txt"))
# Create a dev evaluator
dev_evaluator = evaluation.RerankingEvaluator(dev_dataset, name="AskUbuntu dev")
logging.info("Dev performance before training")
dev_evaluator(model)
test_evaluator = evaluation.RerankingEvaluator(test_dataset, name="AskUbuntu test")
logging.info("Test performance before training")
test_evaluator(model)
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
import logging
import os
import gzip
from torch.utils.data import DataLoader
from datetime import datetime
import sys
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
################# Download AskUbuntu and extract training corpus #################
askubuntu_folder = "data/askubuntu"
result_folder = "output/askubuntu-tsdae-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
batch_size = 8
## Download the AskUbuntu dataset from https://github.com/taolei87/askubuntu
for filename in ["text_tokenized.txt.gz", "dev.txt", "test.txt", "train_random.txt"]:
filepath = os.path.join(askubuntu_folder, filename)
if not os.path.exists(filepath):
util.http_get("https://github.com/taolei87/askubuntu/raw/master/" + filename, filepath)
# Read the corpus
corpus = {}
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, "text_tokenized.txt.gz"), "rt", encoding="utf8") as fIn:
for line in fIn:
splits = line.strip().split("\t")
id = splits[0]
title = splits[1]
corpus[id] = title
# Read dev & test dataset
def read_eval_dataset(filepath):
dataset = []
with open(filepath) as fIn:
for line in fIn:
query_id, relevant_id, candidate_ids, bm25_scores = line.strip().split("\t")
if len(relevant_id) == 0: # Skip examples without relevant entries
continue
relevant_id = relevant_id.split(" ")
candidate_ids = candidate_ids.split(" ")
negative_ids = set(candidate_ids) - set(relevant_id)
dataset.append(
{
"query": corpus[query_id],
"positive": [corpus[pid] for pid in relevant_id],
"negative": [corpus[pid] for pid in negative_ids],
}
)
dev_test_ids.add(query_id)
dev_test_ids.update(candidate_ids)
return dataset
dev_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "dev.txt"))
test_dataset = read_eval_dataset(os.path.join(askubuntu_folder, "test.txt"))
## Now we need a list of train sentences.
## In this example we simply use all sentences that don't appear in the dev/test set
train_sentences = []
for id, sentence in corpus.items():
if id not in dev_test_ids:
train_sentences.append(sentence)
logging.info("{} train sentences".format(len(train_sentences)))
################# Initialize an SBERT model #################
model_name = sys.argv[1] if len(sys.argv) >= 2 else "bert-base-uncased"
word_embedding_model = models.Transformer(model_name)
# Apply **cls** pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Train and evaluate the model (it needs about 1 hour for one epoch of AskUbuntu) #################
# We wrap our training sentences in the DenoisingAutoEncoderDataset to add deletion noise on the fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)
# Create a dev evaluator
dev_evaluator = evaluation.RerankingEvaluator(dev_dataset, name="AskUbuntu dev")
logging.info("Dev performance before training")
dev_evaluator(model)
total_steps = 20000
logging.info("Start training")
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
evaluation_steps=1000,
epochs=1,
steps_per_epoch=total_steps,
weight_decay=0,
scheduler="constantlr",
optimizer_params={"lr": 3e-5},
output_path=result_folder,
show_progress_bar=True,
)
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Training parameters
model_name = "bert-base-uncased"
train_batch_size = 8
num_epochs = 1
max_seq_length = 75
# Save path to store our model
model_save_path = "output/training_stsb_tsdae-{}-{}-{}".format(
model_name, train_batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "data/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# Defining our sentence transformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# We use 1 Million sentences from Wikipedia to train our model
wikipedia_dataset_path = "data/wiki1m_for_simcse.txt"
if not os.path.exists(wikipedia_dataset_path):
util.http_get(
"https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt",
wikipedia_dataset_path,
)
# train_sentences is a list of plain sentences; the DenoisingAutoEncoderDataset below adds deletion noise to them on the fly
train_sentences = []
with open(wikipedia_dataset_path, "r", encoding="utf8") as fIn:
for line in fIn:
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
if row["split"] == "dev":
dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
elif row["split"] == "test":
test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
dev_samples, batch_size=train_batch_size, name="sts-dev"
)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
test_samples, batch_size=train_batch_size, name="sts-test"
)
# We train our model using the DenoisingAutoEncoderLoss (TSDAE)
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, drop_last=True)
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)
evaluation_steps = 1000
logging.info("Training sentences: {}".format(len(train_sentences)))
logging.info("Performance before training")
dev_evaluator(model)
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=evaluation_steps,
output_path=model_save_path,
weight_decay=0,
warmup_steps=100,
optimizer_params={"lr": 3e-5},
    use_amp=True,  # Automatic mixed precision; requires a GPU with FP16 support
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator(model, output_path=model_save_path)
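# A small usage sketch (not part of the original script): the fine-tuned TSDAE model behaves
# like any other SentenceTransformer, so we can encode sentences and compare them via cosine
# similarity. The two sentences below are made-up examples.
example_sentences = ["A man is playing a guitar.", "Someone is playing an instrument."]
example_embeddings = model.encode(example_sentences, convert_to_tensor=True)
example_score = util.cos_sim(example_embeddings[0], example_embeddings[1]).item()
logging.info("Cosine similarity of the example pair: {:.4f}".format(example_score))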
"""
This file loads sentences from a provided text file. It is expected that there is one sentence per line in that text file.
TSDAE will be trained using these sentences. Checkpoints are stored every 500 steps in the output folder.
Usage:
python train_tsdae_from_file.py path/to/sentences.txt
"""
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, datasets, losses
import logging
import gzip
from torch.utils.data import DataLoader
from datetime import datetime
import sys
import tqdm
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Train Parameters
model_name = "bert-base-uncased"
batch_size = 8
# Input file path (a text file, each line a sentence)
if len(sys.argv) < 2:
print("Run this script with: python {} path/to/sentences.txt".format(sys.argv[0]))
exit()
filepath = sys.argv[1]
# Save path to store our model
output_name = ""
if len(sys.argv) >= 3:
output_name = "-" + sys.argv[2].replace(" ", "_").replace("/", "_").replace("\\", "_")
model_output_path = "output/train_tsdae{}-{}".format(output_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
################# Read the train corpus #################
train_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
for line in tqdm.tqdm(fIn, desc="Read file"):
line = line.strip()
if len(line) >= 10:
train_sentences.append(line)
logging.info("{} train sentences".format(len(train_sentences)))
################# Initialize an SBERT model #################
word_embedding_model = models.Transformer(model_name)
# Apply CLS pooling to get one fixed-sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
################# Train the model #################
# We wrap our training sentences in the DenoisingAutoEncoderDataset to add deletion noise on the fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)
logging.info("Start training")
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=1,
weight_decay=0,
scheduler="constantlr",
optimizer_params={"lr": 3e-5},
show_progress_bar=True,
checkpoint_path=model_output_path,
use_amp=False, # Set to True, if your GPU supports FP16 cores
)
"""
In this example we train a semantic search model to search through Wikipedia
articles about programming languages & technologies.
We use the text paragraphs from the following Wikipedia articles:
Assembly language, C, C Sharp, C++, Go, Java, JavaScript, Keras, Laravel, MATLAB, Matplotlib, MongoDB, MySQL, Natural Language Toolkit, NumPy, pandas (software), Perl, PHP, PostgreSQL, Python, PyTorch, R, React, Rust, Scala, scikit-learn, SciPy, Swift, TensorFlow, Vue.js
The example consists of three scripts:
1_programming_query_generation.py - We generate queries for all paragraphs from these articles
2_programming_train_bi-encoder.py - We train a SentenceTransformer bi-encoder with these generated queries. This results in a model we can then use for semantic search (for the given Wikipedia articles).
3_programming_semantic_search.py - Shows how the trained model can be used for semantic search
"""
import json
import gzip
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import tqdm
import os
from sentence_transformers import util
paragraphs = set()
# We use the Wikipedia articles of certain programming languages
corpus_filepath = "wiki-programmming-20210101.jsonl.gz"
if not os.path.exists(corpus_filepath):
util.http_get("https://sbert.net/datasets/wiki-programmming-20210101.jsonl.gz", corpus_filepath)
with gzip.open(corpus_filepath, "rt") as fIn:
for line in fIn:
data = json.loads(line.strip())
for p in data["paragraphs"]:
if len(p) > 100: # Only take paragraphs with at least 100 chars
paragraphs.add(p)
paragraphs = list(paragraphs)
print("Paragraphs:", len(paragraphs))
# Now we load the model that is able to generate queries given a paragraph.
# This model was trained on the MS MARCO dataset, which contains about 500k real
# search queries from Bing together with their relevant passages
tokenizer = T5Tokenizer.from_pretrained("BeIR/query-gen-msmarco-t5-large-v1")
model = T5ForConditionalGeneration.from_pretrained("BeIR/query-gen-msmarco-t5-large-v1")
model.eval()
# Select the device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Parameters for generation
batch_size = 8 # Batch size
num_queries = 5 # Number of queries to generate for every paragraph
max_length_paragraph = 300 # Max length for paragraph
max_length_query = 64 # Max length for output query
# Now for every paragraph in our corpus, we generate the queries
with open("generated_queries.tsv", "w") as fOut:
for start_idx in tqdm.trange(0, len(paragraphs), batch_size):
sub_paragraphs = paragraphs[start_idx : start_idx + batch_size]
        inputs = tokenizer(
            sub_paragraphs, max_length=max_length_paragraph, truncation=True, padding=True, return_tensors="pt"
        ).to(device)
outputs = model.generate(
**inputs, max_length=max_length_query, do_sample=True, top_p=0.95, num_return_sequences=num_queries
)
for idx, out in enumerate(outputs):
query = tokenizer.decode(out, skip_special_tokens=True)
            para = sub_paragraphs[idx // num_queries]
fOut.write("{}\t{}\n".format(query.replace("\t", " ").strip(), para.replace("\t", " ").strip()))
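# The resulting generated_queries.tsv contains one tab-separated (query, paragraph) pair per line,
# with num_queries generated queries per paragraph; tabs inside the texts are replaced by spaces
# above so that the file stays well-formed.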
"""
In this example we train a semantic search model to search through Wikipedia
articles about programming languages & technologies.
We use the text paragraphs from the following Wikipedia articles:
Assembly language, C, C Sharp, C++, Go, Java, JavaScript, Keras, Laravel, MATLAB, Matplotlib, MongoDB, MySQL, Natural Language Toolkit, NumPy, pandas (software), Perl, PHP, PostgreSQL, Python, PyTorch, R, React, Rust, Scala, scikit-learn, SciPy, Swift, TensorFlow, Vue.js
The example consists of three scripts:
1_programming_query_generation.py - We generate queries for all paragraphs from these articles
2_programming_train_bi-encoder.py - We train a SentenceTransformer bi-encoder with these generated queries. This results in a model we can then use for semantic search (for the given Wikipedia articles).
3_programming_semantic_search.py - Shows how the trained model can be used for semantic search
"""
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets
import os
train_examples = []
with open("generated_queries.tsv") as fIn:
for line in fIn:
query, paragraph = line.strip().split("\t", maxsplit=1)
train_examples.append(InputExample(texts=[query, paragraph]))
# For the MultipleNegativesRankingLoss, it is important
# that the batch does not contain duplicate entries, i.e.
# no two equal queries and no two equal paragraphs.
# To ensure this, we use a special data loader
train_dataloader = datasets.NoDuplicatesDataLoader(train_examples, batch_size=64)
# Now we create a SentenceTransformer model from scratch
word_emb = models.Transformer("distilbert-base-uncased")
pooling = models.Pooling(word_emb.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_emb, pooling])
# MultipleNegativesRankingLoss requires input pairs (query, relevant_passage)
# and trains the model so that it is suitable for semantic search
train_loss = losses.MultipleNegativesRankingLoss(model)
# Tune the model
num_epochs = 3
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=num_epochs,
warmup_steps=warmup_steps,
show_progress_bar=True,
)
os.makedirs("output", exist_ok=True)
model.save("output/programming-model")
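# A minimal semantic-search sketch (the role 3_programming_semantic_search.py plays in this
# example), assuming the model saved above; the query string and the two example paragraphs
# below are made up for illustration.
from sentence_transformers import util

search_model = SentenceTransformer("output/programming-model")
example_corpus = [
    "Python is an interpreted, high-level, general-purpose programming language.",
    "Rust is a systems programming language focused on performance and memory safety.",
]
corpus_embeddings = search_model.encode(example_corpus, convert_to_tensor=True)
query_embedding = search_model.encode("which language focuses on memory safety?", convert_to_tensor=True)
for hit in util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]:
    print("{:.3f}\t{}".format(hit["score"], example_corpus[hit["corpus_id"]]))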