""" This examples trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch. It uses MatryoshkaLoss with the powerful CoSENTLoss to train models that perform well at output dimensions [768, 512, 256, 128, 64]. It generates sentence embeddings that can be compared using cosine-similarity to measure the similarity. Usage: python matryoshka_sts.py OR python matryoshka_sts.py pretrained_transformer_model_name """ from torch.utils.data import DataLoader import math from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator from sentence_transformers.readers import InputExample import logging from datetime import datetime import sys import os import gzip import csv #### Just some code to print debug information to stdout logging.basicConfig( format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()] ) #### /print debug information to stdout # Check if dataset exists. If not, download and extract it sts_dataset_path = "datasets/stsbenchmark.tsv.gz" if not os.path.exists(sts_dataset_path): util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path) # You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base model_name = sys.argv[1] if len(sys.argv) > 1 else "distilbert-base-uncased" # Read the dataset train_batch_size = 16 num_epochs = 4 model_save_path = ( "output/matryoshka_sts_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") ) # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings word_embedding_model = models.Transformer(model_name) # Apply mean pooling to get one fixed sized sentence vector pooling_model = models.Pooling( word_embedding_model.get_word_embedding_dimension(), pooling_mode_mean_tokens=True, pooling_mode_cls_token=False, pooling_mode_max_tokens=False, ) model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) # Convert the dataset to a DataLoader ready for training logging.info("Read STSbenchmark train dataset") train_samples = [] dev_samples = [] test_samples = [] with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn: reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE) for row in reader: score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1 inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score) if row["split"] == "dev": dev_samples.append(inp_example) elif row["split"] == "test": test_samples.append(inp_example) else: train_samples.append(inp_example) train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size) train_loss = losses.CoSENTLoss(model=model) train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64]) logging.info("Read STSbenchmark dev dataset") evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev") # Configure the training. We skip evaluation in this example warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up logging.info("Warmup-steps: {}".format(warmup_steps)) # Train the model model.fit( train_objectives=[(train_dataloader, train_loss)], evaluator=evaluator, epochs=num_epochs, evaluation_steps=1000, warmup_steps=warmup_steps, output_path=model_save_path, ) ############################################################################## # # Load the stored model and evaluate its performance on STS benchmark dataset # ############################################################################## model = SentenceTransformer(model_save_path) test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test") test_evaluator(model, output_path=model_save_path) # Optionally, save the model to the Hugging Face Hub! # It is recommended to run `huggingface-cli login` to log into your Hugging Face account first model_name = model_name if "/" not in model_name else model_name.split("/")[-1] try: model.push_to_hub(f"{model_name}-sts-matryoshka") except Exception: logging.error( "Error uploading model to the Hugging Face Hub. To upload it manually, you can run " f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({model_save_path!r})` " f"and saving it using `model.push_to_hub('{model_name}-sts-matryoshka')`." )