"""
This example uses Approximate Nearest Neighbor (ANN) search with FAISS (https://github.com/facebookresearch/faiss).
Searching a large corpus with millions of embeddings can be time-consuming. To speed this up,
ANN can index the existing vectors. For a new query vector, this index can be used to find the nearest neighbors.
This nearest neighbor search is not perfect, i.e., it might not perfectly find all top-k nearest neighbors.
In this example, we use FAISS with an inverted file index (IndexIVFFlat). It learns to partition the corpus embeddings
into different clusters (the number is defined by n_clusters). At search time, the matching clusters for a query are found and only vectors
in these clusters must be searched for nearest neighbors.
This script will compare the results from ANN with exact nearest neighbor search and output a Recall@k value
as well as the missing results in the top-k hits list.
See the FAISS repository for how to install FAISS (e.g., via `pip install faiss-cpu`).
As dataset, we use the Quora Duplicate Questions dataset, which contains about 500k questions (only 100k are used):
https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs.
As embedding model, we use the SBERT model 'quora-distilbert-multilingual',
which is aligned for 100 languages. I.e., you can type in a question in various languages and it will
return the closest questions in the corpus (questions in the corpus are mainly in English).
"""
from sentence_transformers import SentenceTransformer, util
import os
import csv
import pickle
import time
import faiss
import numpy as np
model_name = "quora-distilbert-multilingual"
model = SentenceTransformer(model_name)
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 100000
embedding_cache_path = "quora-embeddings-{}-size-{}.pkl".format(model_name.replace("/", "_"), max_corpus_size)
embedding_size = 768 # Size of embeddings
top_k_hits = 10 # Output k hits
# Defining our FAISS index
# Number of clusters used for faiss. Select a value 4*sqrt(N) to 16*sqrt(N) - https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
n_clusters = 1024
# We use Inner Product (dot-product) as the index metric. We will normalize our vectors to unit length, so that the inner product is equal to cosine similarity
quantizer = faiss.IndexFlatIP(embedding_size)
index = faiss.IndexIVFFlat(quantizer, embedding_size, n_clusters, faiss.METRIC_INNER_PRODUCT)
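# The flat inner-product index above serves as the coarse quantizer that assigns each vector to one of the n_clusters cells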
# Number of clusters to explore at search time. We will search for nearest neighbors in 3 clusters.
index.nprobe = 3
# Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download it
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    # Get all unique sentences from the file
    corpus_sentences = set()
    with open(dataset_path, encoding="utf8") as fIn:
        reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            corpus_sentences.add(row["question1"])
            if len(corpus_sentences) >= max_corpus_size:
                break

            corpus_sentences.add(row["question2"])
            if len(corpus_sentences) >= max_corpus_size:
                break

    corpus_sentences = list(corpus_sentences)
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_numpy=True)

    print("Store file on disc")
    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({"sentences": corpus_sentences, "embeddings": corpus_embeddings}, fOut)
else:
    print("Load pre-computed embeddings from disc")
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        corpus_sentences = cache_data["sentences"]
        corpus_embeddings = cache_data["embeddings"]
### Create the FAISS index
print("Start creating FAISS index")
# First, we need to normalize vectors to unit length
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1)[:, None]
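# Note: FAISS also ships an in-place helper for this step: faiss.normalize_L2(corpus_embeddings) (expects a contiguous float32 array)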
# Then we train the index to find a suitable clustering
index.train(corpus_embeddings)
# Finally we add all embeddings to the index
index.add(corpus_embeddings)
######### Search in the index ###########
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
while True:
    inp_question = input("Please enter a question: ")

    start_time = time.time()
    question_embedding = model.encode(inp_question)

    # FAISS works with inner product (dot product). When we normalize vectors to unit length, inner product is equal to cosine similarity
    question_embedding = question_embedding / np.linalg.norm(question_embedding)
    question_embedding = np.expand_dims(question_embedding, axis=0)

    # Search in FAISS. It returns a matrix with distances and corpus ids.
    distances, corpus_ids = index.search(question_embedding, top_k_hits)

    # We extract corpus ids and scores for the first query
    hits = [{"corpus_id": id, "score": score} for id, score in zip(corpus_ids[0], distances[0])]
    hits = sorted(hits, key=lambda x: x["score"], reverse=True)
    end_time = time.time()

    print("Input question:", inp_question)
    print("Results (after {:.3f} seconds):".format(end_time - start_time))
    for hit in hits[0:top_k_hits]:
        print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

    # Approximate Nearest Neighbor (ANN) is not exact, it might miss entries with high cosine similarity
    # Here, we compute the recall of ANN compared to the exact results
    correct_hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k_hits)[0]
    correct_hits_ids = set([hit["corpus_id"] for hit in correct_hits])

    ann_corpus_ids = set([hit["corpus_id"] for hit in hits])
    if len(ann_corpus_ids) != len(correct_hits_ids):
        print("Approximate Nearest Neighbor returned a different number of results than expected")

    recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
    print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))

    if recall < 1:
        print("Missing results:")
        for hit in correct_hits[0:top_k_hits]:
            if hit["corpus_id"] not in ann_corpus_ids:
                print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))
    print("\n\n========\n")
"""
This example uses Approximate Nearest Neighbor (ANN) search with Hnswlib (https://github.com/nmslib/hnswlib/).
Searching a large corpus with millions of embeddings can be time-consuming. To speed this up,
ANN can index the existing vectors. For a new query vector, this index can be used to find the nearest neighbors.
This nearest neighbor search is not perfect, i.e., it might not perfectly find all top-k nearest neighbors.
In this example, we use Hnswlib: It is a fast and easy-to-use library, with excellent results on common benchmarks.
Usually you can install Hnswlib by running:
pip install hnswlib
For more details, see https://github.com/nmslib/hnswlib/
As dataset, we use the Quora Duplicate Questions dataset, which contains about 500k questions (we only use 100k in this example):
https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs
As embedding model, we use the SBERT model 'quora-distilbert-multilingual',
which is aligned for 100 languages. I.e., you can type in a question in various languages and it will
return the closest questions in the corpus (questions in the corpus are mainly in English).
"""
from sentence_transformers import SentenceTransformer, util
import os
import csv
import pickle
import time
import hnswlib
model_name = "quora-distilbert-multilingual"
model = SentenceTransformer(model_name)
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 100000
embedding_cache_path = "quora-embeddings-{}-size-{}.pkl".format(model_name.replace("/", "_"), max_corpus_size)
embedding_size = 768 # Size of embeddings
top_k_hits = 10 # Output k hits
# Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download it
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    # Get all unique sentences from the file
    corpus_sentences = set()
    with open(dataset_path, encoding="utf8") as fIn:
        reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            corpus_sentences.add(row["question1"])
            if len(corpus_sentences) >= max_corpus_size:
                break

            corpus_sentences.add(row["question2"])
            if len(corpus_sentences) >= max_corpus_size:
                break

    corpus_sentences = list(corpus_sentences)
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_numpy=True)

    print("Store file on disc")
    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({"sentences": corpus_sentences, "embeddings": corpus_embeddings}, fOut)
else:
    print("Load pre-computed embeddings from disc")
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        corpus_sentences = cache_data["sentences"]
        corpus_embeddings = cache_data["embeddings"]
# Defining our hnswlib index
index_path = "./hnswlib.index"
# We use the cosine space: hnswlib returns cosine distance (1 - cosine similarity), which we convert back into a similarity score below
index = hnswlib.Index(space="cosine", dim=embedding_size)

if os.path.exists(index_path):
    print("Loading index...")
    index.load_index(index_path)
else:
    ### Create the HNSWLIB index
    print("Start creating HNSWLIB index")
    index.init_index(max_elements=len(corpus_embeddings), ef_construction=400, M=64)

    # Then we add all embeddings together with their ids to the index
    index.add_items(corpus_embeddings, list(range(len(corpus_embeddings))))

    print("Saving index to:", index_path)
    index.save_index(index_path)

# Controlling the recall by setting ef:
index.set_ef(50)  # ef should always be > top_k_hits
######### Search in the index ###########
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
while True:
    inp_question = input("Please enter a question: ")

    start_time = time.time()
    question_embedding = model.encode(inp_question)

    # We use hnswlib's knn_query method to find the top_k_hits
    corpus_ids, distances = index.knn_query(question_embedding, k=top_k_hits)

    # We extract corpus ids and scores for the first query
    hits = [{"corpus_id": id, "score": 1 - score} for id, score in zip(corpus_ids[0], distances[0])]
    hits = sorted(hits, key=lambda x: x["score"], reverse=True)
    end_time = time.time()

    print("Input question:", inp_question)
    print("Results (after {:.3f} seconds):".format(end_time - start_time))
    for hit in hits[0:top_k_hits]:
        print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

    # Approximate Nearest Neighbor (ANN) is not exact, it might miss entries with high cosine similarity
    # Here, we compute the recall of ANN compared to the exact results
    correct_hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k_hits)[0]
    correct_hits_ids = set([hit["corpus_id"] for hit in correct_hits])

    ann_corpus_ids = set([hit["corpus_id"] for hit in hits])
    if len(ann_corpus_ids) != len(correct_hits_ids):
        print("Approximate Nearest Neighbor returned a different number of results than expected")

    recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
    print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))

    if recall < 1:
        print("Missing results:")
        for hit in correct_hits[0:top_k_hits]:
            if hit["corpus_id"] not in ann_corpus_ids:
                print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))
    print("\n\n========\n")
"""
This script contains an example of how to perform semantic search with PyTorch. It performs exact nearest neighbor search.
As dataset, we use the Quora Duplicate Questions dataset, which contains about 500k questions (we only use about 100k):
https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs
As embedding model, we use the SBERT model 'quora-distilbert-multilingual',
which is aligned for 100 languages. I.e., you can type in a question in various languages and it will
return the closest questions in the corpus (questions in the corpus are mainly in English).
Google Colab example: https://colab.research.google.com/drive/12cn5Oo0v3HfQQ8Tv6-ukgxXSmT3zl35A?usp=sharing
"""
from sentence_transformers import SentenceTransformer, util
import os
import csv
import pickle
import time
model_name = "quora-distilbert-multilingual"
model = SentenceTransformer(model_name)
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 100000
embedding_cache_path = "quora-embeddings-{}-size-{}.pkl".format(model_name.replace("/", "_"), max_corpus_size)
# Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download it
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    # Get all unique sentences from the file
    corpus_sentences = set()
    with open(dataset_path, encoding="utf8") as fIn:
        reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            corpus_sentences.add(row["question1"])
            if len(corpus_sentences) >= max_corpus_size:
                break

            corpus_sentences.add(row["question2"])
            if len(corpus_sentences) >= max_corpus_size:
                break

    corpus_sentences = list(corpus_sentences)
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_tensor=True)

    print("Store file on disc")
    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({"sentences": corpus_sentences, "embeddings": corpus_embeddings}, fOut)
else:
    print("Load pre-computed embeddings from disc")
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        corpus_sentences = cache_data["sentences"][0:max_corpus_size]
        corpus_embeddings = cache_data["embeddings"][0:max_corpus_size]
###############################
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
# Move embeddings to the target device of the model
corpus_embeddings = corpus_embeddings.to(model.device)
while True:
    inp_question = input("Please enter a question: ")

    start_time = time.time()
    question_embedding = model.encode(inp_question, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings)
    end_time = time.time()
    hits = hits[0]  # Get the hits for the first query

    print("Input question:", inp_question)
    print("Results (after {:.3f} seconds):".format(end_time - start_time))
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

    print("\n\n========\n")
"""
This example demonstrates the setup for Question-Answer Retrieval.
You can input a query or a question. The script then uses semantic search
to find relevant passages in Simple English Wikipedia (as it is smaller and fits better in RAM).
As model, we use: nq-distilbert-base-v1
It was trained on the Natural Questions dataset, a dataset with real questions from Google Search
together with annotated data from Wikipedia providing the answer. For the passages, we encode the
Wikipedia article title together with the individual text passages.
Google Colab Example: https://colab.research.google.com/drive/11GunvCqJuebfeTlgbJWkIMT0xJH6PWF1?usp=sharing
"""
import json
from sentence_transformers import SentenceTransformer, util
import time
import gzip
import os
import torch
# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
model_name = "nq-distilbert-base-v1"
bi_encoder = SentenceTransformer(model_name)
top_k = 5 # Number of passages we want to retrieve with the bi-encoder
# As dataset, we use Simple English Wikipedia. Compared to the full English wikipedia, it has only
# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder
wikipedia_filepath = "data/simplewiki-2020-11-01.jsonl.gz"
if not os.path.exists(wikipedia_filepath):
    util.http_get("http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz", wikipedia_filepath)
passages = []
with gzip.open(wikipedia_filepath, "rt", encoding="utf8") as fIn:
    for line in fIn:
        data = json.loads(line.strip())

        for paragraph in data["paragraphs"]:
            # We encode the passages as [title, text]
            passages.append([data["title"], paragraph])
# If you like, you can also limit the number of passages you want to use
print("Passages:", len(passages))
# To speed things up, pre-computed embeddings are downloaded.
# The provided file encoded the passages with the model 'nq-distilbert-base-v1'
if model_name == "nq-distilbert-base-v1":
    embeddings_filepath = "simplewiki-2020-11-01-nq-distilbert-base-v1.pt"
    if not os.path.exists(embeddings_filepath):
        util.http_get("http://sbert.net/datasets/simplewiki-2020-11-01-nq-distilbert-base-v1.pt", embeddings_filepath)

    corpus_embeddings = torch.load(embeddings_filepath)
    corpus_embeddings = corpus_embeddings.float()  # Convert embedding file to float
    device = util.get_device_name()
    corpus_embeddings = corpus_embeddings.to(device)
else:  # Here, we compute the corpus_embeddings from scratch (which can take a while depending on the GPU)
    corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
while True:
    query = input("Please enter a question: ")

    # Encode the query using the bi-encoder and find potentially relevant passages
    start_time = time.time()
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query
    end_time = time.time()

    # Output of top-k hits
    print("Input question:", query)
    print("Results (after {:.3f} seconds):".format(end_time - start_time))
    for hit in hits:
        print("\t{:.3f}\t{}".format(hit["score"], passages[hit["corpus_id"]]))

    print("\n\n========\n")
"""
LexRank implementation
Source: https://github.com/crabcamp/lexrank/tree/dev
"""
import numpy as np
from scipy.sparse.csgraph import connected_components
from scipy.special import softmax
import logging
logger = logging.getLogger(__name__)
def degree_centrality_scores(
    similarity_matrix,
    threshold=None,
    increase_power=True,
):
    if not (threshold is None or isinstance(threshold, float) and 0 <= threshold < 1):
        raise ValueError(
            "'threshold' should be a floating-point number from the interval [0, 1) or None",
        )

    if threshold is None:
        markov_matrix = create_markov_matrix(similarity_matrix)
    else:
        markov_matrix = create_markov_matrix_discrete(
            similarity_matrix,
            threshold,
        )

    scores = stationary_distribution(
        markov_matrix,
        increase_power=increase_power,
        normalized=False,
    )

    return scores
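# Power iteration: repeatedly multiply by the transition matrix to approximate its
# dominant eigenvector; squaring the matrix (increase_power) speeds up convergence.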
def _power_method(transition_matrix, increase_power=True, max_iter=10000):
    eigenvector = np.ones(len(transition_matrix))

    if len(eigenvector) == 1:
        return eigenvector

    transition = transition_matrix.transpose()

    for _ in range(max_iter):
        eigenvector_next = np.dot(transition, eigenvector)

        if np.allclose(eigenvector_next, eigenvector):
            return eigenvector_next

        eigenvector = eigenvector_next

        if increase_power:
            transition = np.dot(transition, transition)

    logger.warning("Maximum number of iterations for power method exceeded without convergence!")
    return eigenvector_next
def connected_nodes(matrix):
    _, labels = connected_components(matrix)

    groups = []

    for tag in np.unique(labels):
        group = np.where(labels == tag)[0]
        groups.append(group)

    return groups
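# Row-normalizes the similarity matrix into transition probabilities; falls back to a
# softmax when non-positive weights would break plain row normalization.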
def create_markov_matrix(weights_matrix):
    n_1, n_2 = weights_matrix.shape
    if n_1 != n_2:
        raise ValueError("'weights_matrix' should be square")

    row_sum = weights_matrix.sum(axis=1, keepdims=True)

    # normalize probability distribution differently if we have negative transition values
    if np.min(weights_matrix) <= 0:
        return softmax(weights_matrix, axis=1)

    return weights_matrix / row_sum
def create_markov_matrix_discrete(weights_matrix, threshold):
    discrete_weights_matrix = np.zeros(weights_matrix.shape)
    ixs = np.where(weights_matrix >= threshold)
    discrete_weights_matrix[ixs] = 1

    return create_markov_matrix(discrete_weights_matrix)
def stationary_distribution(
    transition_matrix,
    increase_power=True,
    normalized=True,
):
    n_1, n_2 = transition_matrix.shape
    if n_1 != n_2:
        raise ValueError("'transition_matrix' should be square")

    distribution = np.zeros(n_1)
    grouped_indices = connected_nodes(transition_matrix)

    for group in grouped_indices:
        t_matrix = transition_matrix[np.ix_(group, group)]
        eigenvector = _power_method(t_matrix, increase_power=increase_power)
        distribution[group] = eigenvector

    if normalized:
        distribution /= n_1

    return distribution
# Text Summarization
SentenceTransformers can be used for (extractive) text summarization: The document is broken down into sentences and embedded by SentenceTransformers. Then, we can compute the cosine similarity across all possible sentence combinations.
We then use [LexRank](https://www.aaai.org/Papers/JAIR/Vol22/JAIR-2214.pdf) to find the most central sentences in the document. These central sentences form a good basis for a summarization of the document.
An example is shown in [text-summarization.py](text-summarization.py); a condensed sketch of the pipeline follows.
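Condensed, the pipeline described above looks roughly like this (a minimal sketch; `document.txt` is a hypothetical input file, and the model name matches the example script):

```python
from sentence_transformers import SentenceTransformer, util
import nltk
import numpy as np
from LexRank import degree_centrality_scores

model = SentenceTransformer("all-MiniLM-L6-v2")

# Split the document into sentences and embed them
sentences = nltk.sent_tokenize(open("document.txt").read())
embeddings = model.encode(sentences, convert_to_tensor=True)

# Pairwise cosine similarities -> LexRank centrality -> most central sentences form the summary
cos_scores = util.cos_sim(embeddings, embeddings).numpy()
centrality_scores = degree_centrality_scores(cos_scores, threshold=None)
for idx in np.argsort(-centrality_scores)[:3]:
    print(sentences[idx].strip())
```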
"""
This example uses LexRank (https://www.aaai.org/Papers/JAIR/Vol22/JAIR-2214.pdf)
to create an extractive summarization of a long document.
The document is split into sentences using NLTK, then the sentence embeddings are computed. We
then compute the cosine-similarity across all possible sentence pairs.
We then use LexRank to find the most central sentences in the document, which form our summary.
Input document: the first sections of the English Wikipedia article about New York City
Output summary:
Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass.
New York City (NYC), often called simply New York, is the most populous city in the United States.
Anchored by Wall Street in the Financial District of Lower Manhattan, New York City has been called both the world's leading financial center and the most financially powerful city in the world, and is home to the world's two largest stock exchanges by total market capitalization, the New York Stock Exchange and NASDAQ.
New York City has been described as the cultural, financial, and media capital of the world, significantly influencing commerce, entertainment, research, technology, education, politics, tourism, art, fashion, and sports.
If the New York metropolitan area were a sovereign state, it would have the eighth-largest economy in the world.
Note: Requires NLTK: `pip install nltk`
"""
import nltk
from sentence_transformers import SentenceTransformer, util
import numpy as np
from LexRank import degree_centrality_scores
model = SentenceTransformer("all-MiniLM-L6-v2")
# Our input document we want to summarize
# As example, we take the first section from Wikipedia
document = """
New York City (NYC), often called simply New York, is the most populous city in the United States. With an estimated 2019 population of 8,336,817 distributed over about 302.6 square miles (784 km2), New York City is also the most densely populated major city in the United States. Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass. With almost 20 million people in its metropolitan statistical area and approximately 23 million in its combined statistical area, it is one of the world's most populous megacities. New York City has been described as the cultural, financial, and media capital of the world, significantly influencing commerce, entertainment, research, technology, education, politics, tourism, art, fashion, and sports. Home to the headquarters of the United Nations, New York is an important center for international diplomacy.
Situated on one of the world's largest natural harbors, New York City is composed of five boroughs, each of which is a county of the State of New York. The five boroughs—Brooklyn, Queens, Manhattan, the Bronx, and Staten Island—were consolidated into a single city in 1898. The city and its metropolitan area constitute the premier gateway for legal immigration to the United States. As many as 800 languages are spoken in New York, making it the most linguistically diverse city in the world. New York is home to more than 3.2 million residents born outside the United States, the largest foreign-born population of any city in the world as of 2016. As of 2019, the New York metropolitan area is estimated to produce a gross metropolitan product (GMP) of $2.0 trillion. If the New York metropolitan area were a sovereign state, it would have the eighth-largest economy in the world. New York is home to the highest number of billionaires of any city in the world.
New York City traces its origins to a trading post founded by colonists from the Dutch Republic in 1624 on Lower Manhattan; the post was named New Amsterdam in 1626. The city and its surroundings came under English control in 1664 and were renamed New York after King Charles II of England granted the lands to his brother, the Duke of York. The city was regained by the Dutch in July 1673 and was subsequently renamed New Orange for one year and three months; the city has been continuously named New York since November 1674. New York City was the capital of the United States from 1785 until 1790, and has been the largest U.S. city since 1790. The Statue of Liberty greeted millions of immigrants as they came to the U.S. by ship in the late 19th and early 20th centuries, and is a symbol of the U.S. and its ideals of liberty and peace. In the 21st century, New York has emerged as a global node of creativity, entrepreneurship, and environmental sustainability, and as a symbol of freedom and cultural diversity. In 2019, New York was voted the greatest city in the world per a survey of over 30,000 people from 48 cities worldwide, citing its cultural diversity.
Many districts and landmarks in New York City are well known, including three of the world's ten most visited tourist attractions in 2013. A record 62.8 million tourists visited New York City in 2017. Times Square is the brightly illuminated hub of the Broadway Theater District, one of the world's busiest pedestrian intersections, and a major center of the world's entertainment industry. Many of the city's landmarks, skyscrapers, and parks are known around the world. Manhattan's real estate market is among the most expensive in the world. Providing continuous 24/7 service and contributing to the nickname The City that Never Sleeps, the New York City Subway is the largest single-operator rapid transit system worldwide, with 472 rail stations. The city has over 120 colleges and universities, including Columbia University, New York University, Rockefeller University, and the City University of New York system, which is the largest urban public university system in the United States. Anchored by Wall Street in the Financial District of Lower Manhattan, New York City has been called both the world's leading financial center and the most financially powerful city in the world, and is home to the world's two largest stock exchanges by total market capitalization, the New York Stock Exchange and NASDAQ.
"""
# Split the document into sentences
sentences = nltk.sent_tokenize(document)
print("Num sentences:", len(sentences))
# Compute the sentence embeddings
embeddings = model.encode(sentences, convert_to_tensor=True)
# Compute the pair-wise cosine similarities
cos_scores = util.cos_sim(embeddings, embeddings).numpy()
# Compute the centrality for each sentence
centrality_scores = degree_centrality_scores(cos_scores, threshold=None)
# We argsort so that the first element is the sentence with the highest score
most_central_sentence_indices = np.argsort(-centrality_scores)
# Print the 5 sentences with the highest scores
print("\n\nSummary:")
for idx in most_central_sentence_indices[0:5]:
    print(sentences[idx].strip())
# Domain Adaptation
The goal of **Domain Adaptation** is to adapt text embedding models to your specific text domain without the need to have labeled training data.
Domain adaptation is still an active research field and there exists no perfect solution yet. However, in our two recent papers [TSDAE](https://arxiv.org/abs/2104.06979) and [GPL](https://arxiv.org/abs/2112.07577) we evaluated several methods for adapting text embedding models to your specific domain. You can find an overview of these methods in my [talk on unsupervised domain adaptation](https://youtu.be/xbdLowiQTlk).
## Domain Adaptation vs. Unsupervised Learning
There exist methods for [unsupervised text embedding learning](../unsupervised_learning/README.md); however, they generally perform rather poorly: they are not really able to learn domain-specific concepts.
A much better approach is domain adaptation: Here you have an unlabeled corpus from your specific domain together with an existing labeled corpus. You can find many suitable labeled training datasets here: [embedding-training-data](https://huggingface.co/datasets/sentence-transformers/embedding-training-data)
## Adaptive Pre-Training
When using adaptive pre-training, you first pre-train on your target corpus using e.g. [Masked Language Modeling](../unsupervised_learning/MLM/README.md) or [TSDAE](../unsupervised_learning/TSDAE/README.md) and then you fine-tune on an existing training dataset (see [embedding-training-data](https://huggingface.co/datasets/sentence-transformers/embedding-training-data)).
![Adaptive Pre-Training](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/adaptive_pre-training.png)
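A minimal sketch of this two-stage recipe with `sentence-transformers` (the base checkpoint, hyperparameters, and in-line example data are placeholder assumptions; the TSDAE and MLM READMEs linked above contain the full scripts):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, models, losses, datasets

# Stage 1: adaptive pre-training with TSDAE on the unlabeled target corpus
word_embedding_model = models.Transformer("bert-base-uncased")
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

domain_sentences = ["Unlabeled sentence from your domain.", "Another in-domain sentence."]
tsdae_data = datasets.DenoisingAutoEncoderDataset(domain_sentences)
tsdae_dataloader = DataLoader(tsdae_data, batch_size=8, shuffle=True)
tsdae_loss = losses.DenoisingAutoEncoderLoss(model, tie_encoder_decoder=True)
model.fit(train_objectives=[(tsdae_dataloader, tsdae_loss)], epochs=1, weight_decay=0, scheduler="constantlr")

# Stage 2: supervised fine-tuning on an existing labeled dataset (e.g. paraphrase pairs)
labeled_examples = [InputExample(texts=["A man is eating.", "A person eats."])]
labeled_dataloader = DataLoader(labeled_examples, batch_size=64, shuffle=True)
fine_tune_loss = losses.MultipleNegativesRankingLoss(model)
model.fit(train_objectives=[(labeled_dataloader, fine_tune_loss)], epochs=1, warmup_steps=100)
```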
In our paper [TSDAE](https://arxiv.org/abs/2104.06979) we evaluated several methods for domain adaptation on 4 domain specific sentence embedding tasks:
| Approach | AskUbuntu | CQADupStack | Twitter | SciDocs | Avg |
| -------- | :-------: | :---------: | :-----: | :-----: | :---: |
| Zero-Shot Model | 54.5 | 12.9 | 72.2 | 69.4 | 52.3 |
| TSDAE | 59.4 | **14.4** | **74.5** | **77.6** | **56.5** |
| MLM | **60.6** | 14.3 | 71.8 | 76.9 | 55.9 |
| CT | 56.4 | 13.4 | 72.4 | 69.7 | 53.0 |
| SimCSE | 56.2 | 13.1 | 71.4 | 68.9 | 52.4 |
As we can see, the performance can improve by up to 8 points when you first perform pre-training on your specific corpus and then fine-tune on the provided labeled training data.
In [GPL](https://arxiv.org/abs/2112.07577) we evaluate these methods for semantic search: Given a short query, find the relevant passage. Here, performance can improve up to 10 points:
| Approach | FiQA | SciFact | BioASQ | TREC-COVID | CQADupStack | Robust04 | Avg |
| -------- | :---: | :-----: | :----: | :--------: | :---------: | :------: | :---: |
| Zero-Shot Model | 26.7 | 57.1 | 52.9 | 66.1 | 29.6 | 39.0 | 45.2 |
| TSDAE | 29.3 | **62.8** | **55.5** | **76.1** | **31.8** | **39.4** | **49.2** |
| MLM | **30.2** | 60.0 | 51.3 | 69.5 | 30.4 | 38.8 | 46.7 |
| ICT | 27.0 | 58.3 | 55.3 | 69.7 | 31.3 | 37.4 | 46.5 |
| SimCSE | 26.7 | 55.0 | 53.2 | 68.3 | 29.0 | 37.9 | 45.0 |
| CD | 27.0 | 62.7 | 47.7 | 65.4 | 30.6 | 34.5 | 44.7 |
| CT | 28.3 | 55.6 | 49.9 | 63.8 | 30.5 | 35.9 | 44.0 |
A big **disadvantage of adaptive pre-training** is the high computational overhead, as you must first run pre-training on your corpus and then supervised learning on a labeled training dataset. The labeled training datasets can be quite large (e.g. the `all-*-v1` models were trained on over 1 billion training pairs).
## GPL: Generative Pseudo-Labeling
[GPL](https://arxiv.org/abs/2112.07577) overcomes the aforementioned issue: It can be applied on top of a fine-tuned model. Hence, you can use one of the [pre-trained models](https://www.sbert.net/docs/pretrained_models.html) and adapt it to your specific domain:
![GPL_Overview](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/gpl_overview.png)
The longer you train, the better your model gets. In our experiments, we trained the models for about 1 day on a V100 GPU. GPL can be combined with adaptive pre-training, which can give another performance boost.
![GPL_Steps](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/gpl_steps.png)
### GPL Steps
GPL works in three phases:
![GPL Architecture](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/gpl_architecture.png)
- **Query Generation**: For a given text from our domain, we first use a T5 model that generates a possible query for the given text. E.g. when your text is *"Python is a high-level general-purpose programming language"*, the model might generate a query like *"What is Python"*. You can find various query generators on our [doc2query-hub](https://huggingface.co/doc2query).
- **Negative Mining**: Next, for the generated query *"What is Python"* we mine negative passages from our corpus, i.e. passages that are similar to the query but which a user would not consider relevant. Such a negative passage could be *"Java is a high-level, class-based, object-oriented programming language."*. We do this mining using dense retrieval, i.e. we use one of the existing text embedding models and retrieve relevant paragraphs for the given query.
- **Pseudo Labeling**: It might be that in the negative mining step we retrieve a passage that is actually relevant for the query (like another definition for *"What is Python"*). To overcome this issue, we use a [Cross-Encoder](https://www.sbert.net/examples/applications/cross-encoder/README.html) to score all (query, passage)-pairs.
- **Training**: Once we have the triplets *(generated query, positive passage, mined negative passage)* and the Cross-Encoder scores for *(query, positive)* and *(query, negative)* we can start training the text embedding model using [MarginMSELoss](https://www.sbert.net/docs/package_reference/losses.html#marginmseloss).
The **pseudo labeling** step is quite important: it is responsible for the increased performance compared to the previous method QGen, which treated passages just as positives (1) or negatives (0). As we see in the following picture, for a generated query (*"what is futures contract"*), the negative mining step retrieves passages that are partly or highly relevant to the generated query. Using MarginMSELoss and the Cross-Encoder, we can identify these passages and teach the text embedding model that these passages are also relevant for the given query.
![GPL Architecture](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/gpl_negatives.jpg)
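To make the pseudo labeling and training steps concrete, here is a minimal sketch (the example texts are hypothetical and the two model names are assumptions; the full pipeline lives in the GPL repository linked below):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, CrossEncoder, losses

# One GPL-style triplet: (generated query, positive passage, mined negative passage)
query = "what is python"
positive = "Python is a high-level general-purpose programming language."
negative = "Java is a high-level, class-based, object-oriented programming language."

# Pseudo labeling: score both pairs with a Cross-Encoder; the label is the score margin
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
score_pos, score_neg = cross_encoder.predict([(query, positive), (query, negative)])
train_examples = [InputExample(texts=[query, positive, negative], label=float(score_pos - score_neg))]

# Training: MarginMSELoss teaches the bi-encoder to reproduce the Cross-Encoder's margin
model = SentenceTransformer("msmarco-distilbert-base-v3")
train_dataloader = DataLoader(train_examples, batch_size=32, shuffle=True)
train_loss = losses.MarginMSELoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)
```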
The following table gives an overview of GPL in comparison to adaptive pre-training (MLM and TSDAE). As mentioned, GPL can be combined with adaptive pre-training.
| Approach | FiQA | SciFact | BioASQ | TREC-COVID | CQADupStack | Robust04 | Avg |
| -------- | :---: | :-----: | :----: | :--------: | :---------: | :------: | :---: |
| Zero-Shot model | 26.7 | 57.1 | 52.9 | 66.1 | 29.6 | 39.0 | 45.2 |
| TSDAE + GPL | **33.3** | **67.3** | **62.8** | 74.0 | **35.1** | **42.1** | **52.4** |
| GPL | 33.1 | 65.2 | 61.6 | 71.7 | 34.4 | **42.1** | 51.4 |
| TSDAE | 29.3 | 62.8 | 55.5 | **76.1** | 31.8 | 39.4 | 49.2 |
| MLM | 30.2 | 60.0 | 51.3 | 69.5 | 30.4 | 38.8 | 46.7 |
### GPL Code
You can find the code for GPL here: [https://github.com/UKPLab/gpl](https://github.com/UKPLab/gpl)
We made the code simple to use, so that you just need to pass your corpus and everything else is handled by the training code.
### Citation
If you find these resources helpful, feel free to cite our papers.
[TSDAE: Using Transformer-based Sequential Denoising Auto-Encoder for Unsupervised Sentence Embedding Learning](https://arxiv.org/abs/2104.06979)
```bibtex
@inproceedings{wang-2021-TSDAE,
title = "TSDAE: Using Transformer-based Sequential Denoising Auto-Encoderfor Unsupervised Sentence Embedding Learning",
author = "Wang, Kexin and Reimers, Nils and Gurevych, Iryna",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
pages = "671--688",
url = "https://arxiv.org/abs/2104.06979",
}
```
[GPL: Generative Pseudo Labeling for Unsupervised Domain Adaptation of Dense Retrieval](https://arxiv.org/abs/2112.07577):
```bibtex
@article{wang-2021-GPL,
title = "GPL: Generative Pseudo Labeling for Unsupervised Domain Adaptation of Dense Retrieval",
author = "Wang, Kexin and Thakur, Nandan and Reimers, Nils and Gurevych, Iryna",
journal= "arXiv preprint arXiv:2112.07577",
month = "12",
year = "2021",
url = "https://arxiv.org/abs/2112.07577",
}
```
"""
This example measures the inference speed of a given model
Usage:
python evaluation_inference_speed.py
OR
python evaluation_inference_speed.py model_name
"""
from sentence_transformers import SentenceTransformer, util
import sys
import os
import time
import torch
import gzip
import csv
# Limit torch to 4 threads
torch.set_num_threads(4)
model_name = sys.argv[1] if len(sys.argv) > 1 else "bert-base-nli-mean-tokens"
# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)
nli_dataset_path = "datasets/AllNLI.tsv.gz"
sentences = set()
max_sentences = 100000
# Download datasets if needed
if not os.path.exists(nli_dataset_path):
    util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)

with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        sentences.add(row["sentence1"])
        if len(sentences) >= max_sentences:
            break
sentences = list(sentences)
print("Model Name:", model_name)
print("Number of sentences:", len(sentences))
for i in range(3):
    print("Run", i)
    start_time = time.time()
    emb = model.encode(sentences, batch_size=32)
    end_time = time.time()
    diff_time = end_time - start_time
    print("Done after {:.2f} seconds".format(diff_time))
    print("Speed: {:.2f} sentences / second".format(len(sentences) / diff_time))
    print("=====")
"""
This example loads a pre-trained model and evaluates it on the STSbenchmark dataset
Usage:
python evaluation_stsbenchmark.py
OR
python evaluation_stsbenchmark.py model_name
"""
from sentence_transformers import SentenceTransformer, util, LoggingHandler, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
import sys
import torch
import gzip
import os
import csv
script_folder_path = os.path.dirname(os.path.realpath(__file__))
# Limit torch to 4 threads
torch.set_num_threads(4)
#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
model_name = sys.argv[1] if len(sys.argv) > 1 else "stsb-distilroberta-base-v2"
# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)
sts_dataset_path = "data/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
    util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row["score"]) / 5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)

        if row["split"] == "dev":
            dev_samples.append(inp_example)
        elif row["split"] == "test":
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
model.evaluate(evaluator)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
model.evaluate(evaluator)
"""
Given a tab-separated file (.tsv) with parallel sentences, where the second column is the translation of the sentence in the first column, for example, in the format:
src1    trg1
src2    trg2
...
where trg_i is the translation of src_i.
Given src_i, the TranslationEvaluator checks which trg_j has the highest similarity using cosine similarity. If i == j, we assume
a match, i.e., the correct translation has been found for src_i out of all possible target sentences.
It then computes an accuracy over all possible source sentences src_i. Equivalently, it also computes the accuracy for the other direction.
A high accuracy score indicates that the model is able to find the correct translation out of a large pool of sentences.
Usage:
python [model_name_or_path] [parallel-file1] [parallel-file2] ...
For example:
python distiluse-base-multilingual-cased talks-en-de.tsv.gz
See the training_multilingual/get_parallel_data_...py scripts for getting parallel sentence data from different sources
"""
from sentence_transformers import SentenceTransformer, evaluation, LoggingHandler
import sys
import gzip
import os
import logging
logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)
model_name = sys.argv[1]
filepaths = sys.argv[2:]
inference_batch_size = 32
model = SentenceTransformer(model_name)
for filepath in filepaths:
    src_sentences = []
    trg_sentences = []
    with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
        filepath, "r", encoding="utf8"
    ) as fIn:
        for line in fIn:
            splits = line.strip().split("\t")
            if len(splits) >= 2:
                src_sentences.append(splits[0])
                trg_sentences.append(splits[1])

    logger.info(os.path.basename(filepath) + ": " + str(len(src_sentences)) + " sentence pairs")

    dev_trans_acc = evaluation.TranslationEvaluator(
        src_sentences, trg_sentences, name=os.path.basename(filepath), batch_size=inference_batch_size
    )
    dev_trans_acc(model)
# Training
This folder contains various examples to fine-tune `SentenceTransformers` for specific tasks.
To get started, I recommend having a look at the Semantic Textual Similarity ([STS](sts/)) or the Natural Language Inference ([NLI](nli/)) examples.
For documentation on how to train your own models, see [Training Overview](http://www.sbert.net/docs/training/overview.html).
## Training Examples
- [avg_word_embeddings](avg_word_embeddings/) - This folder contains examples to train models based on classical word embeddings like GloVe. These models are extremely fast, but are less accurate than transformer-based models.
- [distillation](distillation/) - Examples to make models smaller, faster and lighter.
- [multilingual](multilingual/) - Existing monolingual models can be extended to various languages ([paper](https://arxiv.org/abs/2004.09813)). This folder contains a step-by-step guide to extend existing models to new languages.
- [nli](nli/) - Natural Language Inference (NLI) data can be quite helpful to pre-train and fine-tune models to create meaningful sentence embeddings.
- [quora_duplicate_questions](quora_duplicate_questions/) - Quora Duplicate Questions is a large corpus of duplicate questions from the Quora community. The folder contains examples of how to train models for duplicate question mining and for semantic search.
- [sts](sts/) - The most basic method to train models is using Semantic Textual Similarity (STS) data. Here, we have a sentence pair and a score indicating the semantic similarity (see the sketch after this list).
- [other](other/) - Various tiny examples for show-casing one specific training case.
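For instance, the most basic STS setup mentioned above boils down to a few lines (a minimal sketch with made-up sentence pairs; the [sts](sts/) folder contains the complete scripts):

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("distilroberta-base")

# Sentence pairs with similarity scores in [0, 1]
train_examples = [
    InputExample(texts=["A man is eating food.", "A man is eating a meal."], label=0.9),
    InputExample(texts=["A man is eating food.", "The girl is carrying a baby."], label=0.1),
]
train_dataloader = DataLoader(train_examples, batch_size=2, shuffle=True)

# CosineSimilarityLoss pushes the embedding cosine similarity towards the gold score
train_loss = losses.CosineSimilarityLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)
```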
# Adaptive Layers
Embedding models are often encoder models with numerous layers, such as 12 (e.g. [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)) or 6 (e.g. [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)). To get embeddings, every single one of these layers must be traversed. [2D Matryoshka Sentence Embeddings](https://arxiv.org/abs/2402.14776) (2DMSE) revisits this concept by proposing an approach to train embedding models that will perform well when only using a selection of all layers. This results in faster inference speeds at relatively low performance costs.
## Use Cases
The 2DMSE paper mentions that using a few layers of a larger model trained using Adaptive Layers and Matryoshka Representation Learning can outperform a smaller model that was trained like a standard embedding model.
## Results
Let's look at the performance that we may be able to expect from an Adaptive Layer embedding model versus a regular embedding model. For this experiment, I have trained two models:
* [tomaarsen/mpnet-base-nli-adaptive-layer](https://huggingface.co/tomaarsen/mpnet-base-nli-adaptive-layer): Trained by running [adaptive_layer_nli.py](adaptive_layer_nli.py) with [microsoft/mpnet-base](https://huggingface.co/microsoft/mpnet-base).
* [tomaarsen/mpnet-base-nli](https://huggingface.co/tomaarsen/mpnet-base-nli): A near identical model as the former, but using only `MultipleNegativesRankingLoss` rather than `AdaptiveLayerLoss` on top of `MultipleNegativesRankingLoss`. I also use [microsoft/mpnet-base](https://huggingface.co/microsoft/mpnet-base) as the base model.
Both of these models were trained on the AllNLI dataset, which is a concatenation of the [SNLI](https://huggingface.co/datasets/snli) and [MultiNLI](https://huggingface.co/datasets/multi_nli) datasets. I have evaluated these models on the [STSBenchmark](https://huggingface.co/datasets/mteb/stsbenchmark-sts) test set using multiple different embedding dimensions. The results are plotted in the following figure:
![adaptive_layer_results](https://huggingface.co/tomaarsen/mpnet-base-nli-adaptive-layer/resolve/main/adaptive_layer_results.png)
The first figure shows that the Adaptive Layer model stays much more performant when reducing the number of layers in the model. This is also clearly shown in the second figure, which displays that 80% of the performance is preserved when the number of layers is reduced all the way to 1.
Lastly, the third figure shows the expected speedup ratio for GPU & CPU devices in my tests. As you can see, removing half of the layers results in roughly a 2x speedup, at a cost of ~15% performance on STSB (~86 -> ~75 Spearman correlation). When removing even more layers, the performance benefit gets larger for CPUs, and between 5x and 10x speedups are very feasible with a 20% loss in performance.
## Training
Training with Adaptive Layer support is quite elementary: rather than applying some loss function on only the last layer, we also apply that same loss function on the pooled embeddings from previous layers. Additionally, we employ a KL-divergence loss that aims to make the embeddings of the non-last layers match those of the last layer. This can be seen as a fascinating approach to [knowledge distillation](../distillation/README.html#knowledge-distillation), but with the last layer as the teacher model and the prior layers as the student models.
For example, with the 12-layer [microsoft/mpnet-base](https://huggingface.co/microsoft/mpnet-base), it will now be trained such that the model produces meaningful embeddings after each of the 12 layers.
```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import CoSENTLoss, AdaptiveLayerLoss
model = SentenceTransformer("microsoft/mpnet-base")
base_loss = CoSENTLoss(model=model)
loss = AdaptiveLayerLoss(model=model, loss=base_loss)
```
* **Reference**: <a href="../../../docs/package_reference/losses.html#adaptivelayerloss"><code>AdaptiveLayerLoss</code></a>
Note that training with `AdaptiveLayerLoss` is not notably slower than without using it.
Additionally, this can be combined with the `MatryoshkaLoss` such that the resulting model can be reduced both in the number of layers, but also in the size of the output dimensions. See also the [Matryoshka Embeddings](../matryoshka/README.html) for more information on reducing output dimensions. In Sentence Transformers, the combination of these two losses is called `Matryoshka2dLoss`, and a shorthand is provided for simpler training.
```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import CoSENTLoss, Matryoshka2dLoss
model = SentenceTransformer("microsoft/mpnet-base")
base_loss = CoSENTLoss(model=model)
loss = Matryoshka2dLoss(model=model, loss=base_loss, matryoshka_dims=[768, 512, 256, 128, 64])
```
* **Reference**: <a href="../../../docs/package_reference/losses.html#matryoshka2dloss"><code>Matryoshka2dLoss</code></a>
## Inference
After a model has been trained using the Adaptive Layer loss, you can then truncate the model layers to your desired layer count. Note that this requires doing a bit of surgery on the model itself, and each model is structured a bit differently, so the steps are slightly different depending on the model.
First of all, we will load the model & access the underlying `transformers` model like so:
```python
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("tomaarsen/mpnet-base-nli-adaptive-layer")
# We can access the underlying model with `model[0].auto_model`
print(model[0].auto_model)
```
```
MPNetModel(
(embeddings): MPNetEmbeddings(
(word_embeddings): Embedding(30527, 768, padding_idx=1)
(position_embeddings): Embedding(514, 768, padding_idx=1)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): MPNetEncoder(
(layer): ModuleList(
(0-11): 12 x MPNetLayer(
(attention): MPNetAttention(
(attn): MPNetSelfAttention(
(q): Linear(in_features=768, out_features=768, bias=True)
(k): Linear(in_features=768, out_features=768, bias=True)
(v): Linear(in_features=768, out_features=768, bias=True)
(o): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(intermediate): MPNetIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): MPNetOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(relative_attention_bias): Embedding(32, 12)
)
(pooler): MPNetPooler(
(dense): Linear(in_features=768, out_features=768, bias=True)
(activation): Tanh()
)
)
```
This output will differ depending on the model. We will look for the repeated layers in the encoder. For this MPNet model, this is stored under `model[0].auto_model.encoder.layer`. Then we can slice the model to only keep the first few layers to speed up the model:
```python
new_num_layers = 3
model[0].auto_model.encoder.layer = model[0].auto_model.encoder.layer[:new_num_layers]
```
Then we can run inference with it using <a href="../../../docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode"><code>SentenceTransformers.encode</code></a>.
```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
model = SentenceTransformer("tomaarsen/mpnet-base-nli-adaptive-layer")
new_num_layers = 3
model[0].auto_model.encoder.layer = model[0].auto_model.encoder.layer[:new_num_layers]
embeddings = model.encode(
[
"The weather is so nice!",
"It's so sunny outside!",
"He drove to the stadium.",
]
)
# Similarity of the first sentence with the other two
similarities = cos_sim(embeddings[0], embeddings[1:])
# => tensor([[0.7761, 0.1655]])
# compared to tensor([[ 0.7547, -0.0162]]) for the full model
```
As you can see, the similarity between the related sentences is much higher than the unrelated sentence, despite only using 3 layers. Feel free to copy this script locally, modify the `new_num_layers`, and observe the difference in similarities.
## Code Examples
See the following scripts as examples of how to apply the <a href="../../../docs/package_reference/losses.html#adaptivelayerloss"><code>AdaptiveLayerLoss</code></a> in practice:
* **[adaptive_layer_nli.py](adaptive_layer_nli.py)**: This example uses the `MultipleNegativesRankingLoss` with `AdaptiveLayerLoss` to train a strong embedding model using Natural Language Inference (NLI) data. It is an adaptation of the [NLI](../nli/README) documentation.
* **[adaptive_layer_sts.py](adaptive_layer_sts.py)**: This example uses the `CoSENTLoss` with `AdaptiveLayerLoss` to train an embedding model on the training set of the STSBenchmark dataset. It is an adaptation of the [STS](../sts/README) documentation.
And the following scripts to see how to apply <a href="../../../docs/package_reference/losses.html#matryoshka2dloss"><code>Matryoshka2dLoss</code></a>:
* **[2d_matryoshka_nli.py](../matryoshka/2d_matryoshka_nli.py)**: This example uses the `MultipleNegativesRankingLoss` with `Matryoshka2dLoss` to train a strong embedding model using Natural Language Inference (NLI) data. It is an adaptation of the [NLI](../nli/README) documentation.
* **[2d_matryoshka_sts.py](../matryoshka/2d_matryoshka_sts.py)**: This example uses the `CoSENTLoss` with `Matryoshka2dLoss` to train an embedding model on the training set of the STSBenchmark dataset. It is an adaptation of the [STS](../sts/README) documentation.
"""
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with AdaptiveLayerLoss on top of MultipleNegativesRankingLoss. This trains a model that produces useful embeddings even when
intermediate transformer layers are removed.
Entailments are positive pairs and the contradiction in the AllNLI dataset is added as a hard negative.
At every 10% of the training steps, the model is evaluated on the STS benchmark dataset.
Usage:
python adaptive_layer_nli.py
OR
python adaptive_layer_nli.py pretrained_transformer_model_name
"""
import math
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random
#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilroberta-base"
train_batch_size = 128 # The larger you select this, the better the results (usually). But it requires more GPU memory
max_seq_length = 75
num_epochs = 1
# Save path of the model
model_save_path = (
    "output/adaptive_layer_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Check if dataset exists. If not, download and extract it
nli_dataset_path = "data/AllNLI.tsv.gz"
if not os.path.exists(nli_dataset_path):
    util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)
# Read the AllNLI.tsv.gz file and create the training dataset
logging.info("Read AllNLI train dataset")
def add_to_samples(sent1, sent2, label):
    if sent1 not in train_data:
        train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}

    train_data[sent1][label].add(sent2)
train_data = {}
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        if row["split"] == "train":
            sent1 = row["sentence1"].strip()
            sent2 = row["sentence2"].strip()

            add_to_samples(sent1, sent2, row["label"])
            add_to_samples(sent2, sent1, row["label"])  # Also add the opposite
train_samples = []
for sent1, others in train_data.items():
    if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
        train_samples.append(
            InputExample(
                texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
            )
        )
        train_samples.append(
            InputExample(
                texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
            )
        )
logging.info("Train samples: {}".format(len(train_samples)))
# Special data loader that avoids duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.AdaptiveLayerLoss(model, train_loss)
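# Hedged note: AdaptiveLayerLoss also exposes tuning knobs (check your installed
# version), e.g. how many earlier layers are sampled for the loss per training step:
# train_loss = losses.AdaptiveLayerLoss(model, train_loss, n_layers_per_step=1)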
stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=dev_evaluator,
epochs=num_epochs,
evaluation_steps=int(len(train_dataloader) * 0.1),
warmup_steps=warmup_steps,
output_path=model_save_path,
use_amp=False, # Set to True if your GPU supports FP16 operations
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
test_evaluator(model, output_path=model_save_path)
# Optionally, save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
model.push_to_hub(f"{model_name}-nli-adaptive-layer")
except Exception:
logging.error(
"Error uploading model to the Hugging Face Hub. To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({model_save_path!r})` "
f"and saving it using `model.push_to_hub('{model_name}-nli-adaptive-layer')`."
)
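# --- Hedged inference sketch (separate from the training script above) ---
# Adaptive-layer models can be run with only the first few transformer layers.
# This assumes a BERT/RoBERTa-style backbone whose layer stack lives at
# `auto_model.encoder.layer` (true for the default distilroberta-base).
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(model_save_path)  # model_save_path as defined above
model[0].auto_model.encoder.layer = model[0].auto_model.encoder.layer[:3]  # keep only the first 3 layers
embeddings = model.encode(["A quick test sentence."])  # smaller model, faster inference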
"""
This example trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch.
It uses AdaptiveLayerLoss with the powerful CoSENTLoss to train models that perform well even when only the first few transformer layers are kept.
It generates sentence embeddings that can be compared using cosine similarity to measure their similarity.
Usage:
python adaptive_layer_sts.py
OR
python adaptive_layer_sts.py pretrained_transformer_model_name
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else "distilbert-base-uncased"
# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = (
"output/adaptive_layer_sts_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
else:
train_samples.append(inp_example)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CoSENTLoss(model=model)
train_loss = losses.AdaptiveLayerLoss(model, train_loss)
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
# Optionally, save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
model.push_to_hub(f"{model_name}-sts-adaptive-layer")
except Exception:
logging.error(
"Error uploading model to the Hugging Face Hub. To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({model_save_path!r})` "
f"and saving it using `model.push_to_hub('{model_name}-sts-adaptive-layer')`."
)
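# --- Hedged usage sketch (separate from the training script above) ---
# Score a sentence pair with the trained model via cosine similarity.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer(model_save_path)  # model_save_path as defined above
embeddings = model.encode(["A man is playing guitar.", "Someone plays an instrument."])
print(util.cos_sim(embeddings[0], embeddings[1]))  # in [-1, 1]; higher = more similar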
"""
This example uses average word embeddings (for example from GloVe). It adds two fully-connected feed-forward layers (dense layers) to create a Deep Averaging Network (DAN).
If 'glove.6B.300d.txt.gz' does not exist, it tries to download it from our server.
See https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/
for available word embeddings files
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import os
import csv
import gzip
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Read the dataset
batch_size = 32
model_save_path = "output/training_stsbenchmark_avg_word_embeddings-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
else:
train_samples.append(inp_example)
# Map tokens to traditional word embeddings like GloVe
word_embedding_model = models.WordEmbeddings.from_text_file("glove.6B.300d.txt.gz")
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
# Add two trainable feed-forward networks (DAN)
sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension()
dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension)
dan2 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dan1, dan2])
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
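# --- Hedged toy sketch (numpy) of what the DAN above computes ---
# models.Dense applies a linear layer followed by Tanh by default (assumption:
# the default activation is unchanged in your installed version).
import numpy as np

tokens = np.random.rand(5, 300)  # five 300-dim GloVe token vectors
avg = tokens.mean(axis=0)  # mean pooling -> 300-dim sentence vector
W1, b1 = np.random.randn(300, 300), np.zeros(300)  # dan1 parameters (random stand-ins)
W2, b2 = np.random.randn(300, 300), np.zeros(300)  # dan2 parameters (random stand-ins)
sentence_embedding = np.tanh(W2 @ np.tanh(W1 @ avg + b1) + b2)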
"""
This example runs a BiLSTM after the word embedding lookup. The output of the BiLSTM is then pooled,
for example with max-pooling (which gives a system like InferSent) or with mean-pooling.
Note that you can also feed BERT embeddings into the BiLSTM.
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import os
import csv
import gzip
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Read the dataset
batch_size = 32
model_save_path = "output/training_stsbenchmark_bilstm-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
else:
train_samples.append(inp_example)
# Map tokens to traditional word embeddings like GloVe
word_embedding_model = models.WordEmbeddings.from_text_file("glove.6B.300d.txt.gz")
lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), hidden_dim=1024)
# Apply max pooling to get one fixed-sized sentence vector
pooling_model = models.Pooling(
lstm.get_word_embedding_dimension(),
pooling_mode_mean_tokens=False,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=True,
)
model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model])
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
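# --- Hedged variant sketch: BERT embeddings into the BiLSTM ---
# As the docstring above notes, contextual BERT token embeddings can replace the
# static GloVe lookup. An untested module stack under that assumption:
from sentence_transformers import SentenceTransformer, models

word_embedding_model = models.Transformer("bert-base-uncased")
lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), hidden_dim=1024)
pooling_model = models.Pooling(lstm.get_word_embedding_dimension(), pooling_mode="max")
model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model])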
"""
This example uses a simple bag-of-words (BoW) approach. A sentence is mapped
to a sparse vector with e.g. 25,000 dimensions. Optionally, you can also use tf-idf.
To make the model trainable, we add multiple dense layers to create a Deep Averaging Network (DAN).
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from sentence_transformers.models.tokenizer.WordTokenizer import ENGLISH_STOP_WORDS
import logging
from datetime import datetime
import os
import csv
import gzip
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Read the dataset
batch_size = 32
model_save_path = "output/training_tf-idf_word_embeddings-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
else:
train_samples.append(inp_example)
##### Construction of the SentenceTransformer Model #####
# Wikipedia document frequency for words
wiki_doc_freq = "wikipedia_doc_frequencies.txt"
if not os.path.exists(wiki_doc_freq):
util.http_get(
"https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/wikipedia_doc_frequencies.txt", wiki_doc_freq
)
# Create the vocab for the BoW model
stop_words = ENGLISH_STOP_WORDS
max_vocab_size = 25000 # This is also the size of the BoW sentence vector.
# Read the most common max_vocab_size words. Skip stop-words
vocab = set()
weights = {}
lines = open(wiki_doc_freq, encoding="utf8").readlines()
num_docs = int(lines[0])
for line in lines[1:]:
word, freq = line.lower().strip().split("\t")
if word in stop_words:
continue
vocab.add(word)
weights[word] = math.log(num_docs / int(freq))
if len(vocab) >= max_vocab_size:
break
# Create the BoW model. Because we set word_weights to the IDF values and cumulative_term_frequency=True, we
# get tf-idf vectors. Set word_weights to an empty dict and cumulative_term_frequency=False to get a 1-hot sentence encoding
bow = models.BoW(vocab=vocab, word_weights=weights, cumulative_term_frequency=True)
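# Hedged alternative (same BoW API, per the comment above): a plain binary
# one-hot sentence encoding instead of tf-idf vectors:
# bow = models.BoW(vocab=vocab, word_weights={}, cumulative_term_frequency=False)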
# Add two trainable feed-forward networks (DAN) with max_vocab_size -> 768 -> 512 dimensions.
sent_embeddings_dimension = max_vocab_size
dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=768)
dan2 = models.Dense(in_features=768, out_features=512)
model = SentenceTransformer(modules=[bow, dan1, dan2])
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
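# --- Hedged toy sketch (numpy) of the tf-idf vector the BoW module produces ---
# With cumulative_term_frequency=True, a word's IDF weight is added once per
# occurrence, yielding tf * idf per vocab dimension. Toy vocab and weights:
import numpy as np

vocab_list = ["car", "drives", "road"]
index = {w: i for i, w in enumerate(vocab_list)}
idf = {"car": 1.5, "drives": 2.0, "road": 1.0}
vec = np.zeros(len(vocab_list))
for token in "car drives car".split():
    if token in index:
        vec[index[token]] += idf[token]
print(vec)  # [3. 2. 0.]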
"""
This example runs a CNN after the word embedding lookup. The output of the CNN is then pooled,
for example with mean-pooling.
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import os
import csv
import gzip
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Read the dataset
batch_size = 32
model_save_path = "output/training_stsbenchmark_cnn-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
else:
train_samples.append(inp_example)
# Map tokens to vectors using BERT
word_embedding_model = models.Transformer("bert-base-uncased")
cnn = models.CNN(
in_word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
out_channels=256,
kernel_sizes=[1, 3, 5],
)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
cnn.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, cnn, pooling_model])
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
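# Hedged dimension check for the CNN above: each of the three kernel sizes
# contributes out_channels features per token, concatenated along the feature axis.
assert 256 * len([1, 3, 5]) == 768  # equals cnn.get_word_embedding_dimension()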
"""
This example weights word embeddings (like GloVe) with IDF weights. The IDF weights can for example be computed on Wikipedia.
If 'glove.6B.300d.txt.gz' does not exist, it tries to download it from our server.
See https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/ for available word embeddings files
You can download precomputed Wikipedia document frequencies from:
https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/wikipedia_doc_frequencies.txt
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import os
import csv
import gzip
#### Just some code to print debug information to stdout
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Read the dataset
batch_size = 32
model_save_path = "output/training_tf-idf_word_embeddings-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Wikipedia document frequency for words
wiki_doc_freq = "wikipedia_doc_frequencies.txt"
if not os.path.exists(wiki_doc_freq):
util.http_get(
"https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/wikipedia_doc_frequencies.txt", wiki_doc_freq
)
# Check if dataset exists. If not, download and extract it
sts_dataset_path = "datasets/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
if row["split"] == "dev":
dev_samples.append(inp_example)
elif row["split"] == "test":
test_samples.append(inp_example)
else:
train_samples.append(inp_example)
##### Construction of the SentenceTransformer Model #####
# Map tokens to traditional word embeddings like GloVe
word_embedding_model = models.WordEmbeddings.from_text_file("glove.6B.300d.txt.gz")
# Weight word embeddings using Inverse-Document-Frequency (IDF) values.
# For each word in the vocab of the tokenizer, we must specify a weight value.
# The word embedding is then multiplied by this value
vocab = word_embedding_model.tokenizer.get_vocab()
word_weights = {}
lines = open(wiki_doc_freq, encoding="utf8").readlines()
num_docs = int(lines[0])
for line in lines[1:]:
word, freq = line.strip().split("\t")
word_weights[word] = math.log(num_docs / int(freq))
# Words in the vocab that are not in the doc_frequencies file get a frequency of 1
unknown_word_weight = math.log(num_docs / 1)
# Initialize the WordWeights model. This model must be between the WordEmbeddings and the Pooling model
word_weights_model = models.WordWeights(vocab=vocab, word_weights=word_weights, unknown_word_weight=unknown_word_weight)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False,
)
# Add two trainable feed-forward networks (DAN)
sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension()
dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension)
dan2 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension)
model = SentenceTransformer(modules=[word_embedding_model, word_weights_model, pooling_model, dan1, dan2])
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")
# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
warmup_steps=warmup_steps,
output_path=model_save_path,
)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
test_evaluator(model, output_path=model_save_path)
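# --- Hedged toy sketch (numpy) of the IDF-weighted pooling above ---
# Each token embedding is scaled by its IDF weight before mean pooling
# (assumption: Pooling averages over the token count, not the weight sum).
import numpy as np

emb = np.random.rand(3, 300)  # three GloVe token vectors
idf = np.array([2.1, 0.5, 1.3])  # toy IDF weights from log(num_docs / freq)
sentence_vec = (idf[:, None] * emb).mean(axis=0)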