# Number of clusters used for faiss. Select a value between 4*sqrt(N) and 16*sqrt(N) - https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
n_clusters=1024
# We use Inner Product (dot-product) for the index. We normalize our vectors to unit length, so that the inner product is equal to cosine similarity
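A minimal sketch of how such an index can be set up; the embedding dimension, the random placeholder data, and the `nprobe` value are illustrative:

```python
import faiss
import numpy as np

embedding_dim = 768   # dimension of the sentence embeddings (assumed)
n_clusters = 1024     # roughly 4*sqrt(N) to 16*sqrt(N) for N vectors

# Placeholder embeddings; in practice use model.encode(sentences)
embeddings = np.random.rand(10000, embedding_dim).astype("float32")

# Inner-product index; with L2-normalized vectors, inner product == cosine similarity
quantizer = faiss.IndexFlatIP(embedding_dim)
index = faiss.IndexIVFFlat(quantizer, embedding_dim, n_clusters, faiss.METRIC_INNER_PRODUCT)

faiss.normalize_L2(embeddings)   # normalize to unit length in place
index.train(embeddings)
index.add(embeddings)

index.nprobe = 8                 # number of clusters searched at query time
```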
SentenceTransformers can be used for (extractive) text summarization: The document is broken down into sentences and embedded by SentenceTransformers. Then, we can compute the cosine similarity across all possible sentence combinations.
We then use [LexRank](https://www.aaai.org/Papers/JAIR/Vol22/JAIR-2214.pdf) to find the most central sentences in the document. These central sentences form a good basis for a summarization of the document.
An example is shown in [text-summarization.py](text-summarization.py)
This example uses LexRank (https://www.aaai.org/Papers/JAIR/Vol22/JAIR-2214.pdf)
to create an extractive summarization of a long document.
The document is split into sentences using NLTK, then the sentence embeddings are computed. We
then compute the cosine-similarity across all possible sentence pairs.
We then use LexRank to find the most central sentences in the document, which form our summary.
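A minimal sketch of this pipeline. The model name is an assumption, and the centrality computation below is a simplified stand-in for the LexRank degree centrality used in the actual script:

```python
import nltk
import numpy as np
from sentence_transformers import SentenceTransformer, util

nltk.download("punkt", quiet=True)

document = "New York City (NYC) is the most populous city in the United States. ..."  # your document
sentences = nltk.sent_tokenize(document)

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice
embeddings = model.encode(sentences, convert_to_tensor=True)

# Cosine similarity between all sentence pairs
cos_scores = util.cos_sim(embeddings, embeddings).cpu().numpy()

# Simplified centrality: sentences most similar to all other sentences
# (LexRank instead uses a power-iteration based degree centrality)
centrality = cos_scores.sum(axis=1)
top_idx = np.argsort(-centrality)[:5]
for idx in sorted(top_idx):
    print(sentences[idx])
```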
Input document: the first section of the English Wikipedia article on New York City
Output summary:
Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass.
New York City (NYC), often called simply New York, is the most populous city in the United States.
Anchored by Wall Street in the Financial District of Lower Manhattan, New York City has been called both the world's leading financial center and the most financially powerful city in the world, and is home to the world's two largest stock exchanges by total market capitalization, the New York Stock Exchange and NASDAQ.
New York City has been described as the cultural, financial, and media capital of the world, significantly influencing commerce, entertainment, research, technology, education, politics, tourism, art, fashion, and sports.
If the New York metropolitan area were a sovereign state, it would have the eighth-largest economy in the world.
# As an example, we take the first section from Wikipedia
document="""
New York City (NYC), often called simply New York, is the most populous city in the United States. With an estimated 2019 population of 8,336,817 distributed over about 302.6 square miles (784 km2), New York City is also the most densely populated major city in the United States. Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass. With almost 20 million people in its metropolitan statistical area and approximately 23 million in its combined statistical area, it is one of the world's most populous megacities. New York City has been described as the cultural, financial, and media capital of the world, significantly influencing commerce, entertainment, research, technology, education, politics, tourism, art, fashion, and sports. Home to the headquarters of the United Nations, New York is an important center for international diplomacy.
Situated on one of the world's largest natural harbors, New York City is composed of five boroughs, each of which is a county of the State of New York. The five boroughs—Brooklyn, Queens, Manhattan, the Bronx, and Staten Island—were consolidated into a single city in 1898. The city and its metropolitan area constitute the premier gateway for legal immigration to the United States. As many as 800 languages are spoken in New York, making it the most linguistically diverse city in the world. New York is home to more than 3.2 million residents born outside the United States, the largest foreign-born population of any city in the world as of 2016. As of 2019, the New York metropolitan area is estimated to produce a gross metropolitan product (GMP) of $2.0 trillion. If the New York metropolitan area were a sovereign state, it would have the eighth-largest economy in the world. New York is home to the highest number of billionaires of any city in the world.
New York City traces its origins to a trading post founded by colonists from the Dutch Republic in 1624 on Lower Manhattan; the post was named New Amsterdam in 1626. The city and its surroundings came under English control in 1664 and were renamed New York after King Charles II of England granted the lands to his brother, the Duke of York. The city was regained by the Dutch in July 1673 and was subsequently renamed New Orange for one year and three months; the city has been continuously named New York since November 1674. New York City was the capital of the United States from 1785 until 1790, and has been the largest U.S. city since 1790. The Statue of Liberty greeted millions of immigrants as they came to the U.S. by ship in the late 19th and early 20th centuries, and is a symbol of the U.S. and its ideals of liberty and peace. In the 21st century, New York has emerged as a global node of creativity, entrepreneurship, and environmental sustainability, and as a symbol of freedom and cultural diversity. In 2019, New York was voted the greatest city in the world per a survey of over 30,000 people from 48 cities worldwide, citing its cultural diversity.
Many districts and landmarks in New York City are well known, including three of the world's ten most visited tourist attractions in 2013. A record 62.8 million tourists visited New York City in 2017. Times Square is the brightly illuminated hub of the Broadway Theater District, one of the world's busiest pedestrian intersections, and a major center of the world's entertainment industry. Many of the city's landmarks, skyscrapers, and parks are known around the world. Manhattan's real estate market is among the most expensive in the world. Providing continuous 24/7 service and contributing to the nickname The City that Never Sleeps, the New York City Subway is the largest single-operator rapid transit system worldwide, with 472 rail stations. The city has over 120 colleges and universities, including Columbia University, New York University, Rockefeller University, and the City University of New York system, which is the largest urban public university system in the United States. Anchored by Wall Street in the Financial District of Lower Manhattan, New York City has been called both the world's leading financial center and the most financially powerful city in the world, and is home to the world's two largest stock exchanges by total market capitalization, the New York Stock Exchange and NASDAQ.
The goal of **Domain Adaptation** is to adapt text embedding models to your specific text domain without the need to have labeled training data.
Domain adaptation is still an active research field and there exists no perfect solution yet. However, in our two recent papers [TSDAE](https://arxiv.org/abs/2104.06979) and [GPL](https://arxiv.org/abs/2112.07577) we evaluated several methods for adapting text embedding models to your specific domain. You can find an overview of these methods in my [talk on unsupervised domain adaptation](https://youtu.be/xbdLowiQTlk).
## Domain Adaptation vs. Unsupervised Learning
There exist methods for [unsupervised text embedding learning](../unsupervised_learning/README.md); however, they generally perform rather poorly: they are not really able to learn domain-specific concepts.
A much better approach is domain adaptation: Here you have an unlabeled corpus from your specific domain together with an existing labeled corpus. You can find many suitable labeled training datasets here: [embedding-training-data](https://huggingface.co/datasets/sentence-transformers/embedding-training-data)
## Adaptive Pre-Training
When using adaptive pre-training, you first pre-train on your target corpus using e.g. [Masked Language Modeling](../unsupervised_learning/MLM/README.md) or [TSDAE](../unsupervised_learning/TSDAE/README.md) and then you fine-tune on an existing training dataset (see [embedding-training-data](https://huggingface.co/datasets/sentence-transformers/embedding-training-data)).
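For illustration, a minimal TSDAE pre-training sketch; the base model, the toy corpus, and the hyperparameters are placeholders, and the subsequent supervised fine-tuning step is only indicated by a comment:

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, models, losses, datasets

model_name = "bert-base-uncased"  # assumed base model
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Unlabeled sentences from your target domain (placeholders)
train_sentences = ["sentence 1 from your domain", "sentence 2 from your domain"]

# DenoisingAutoEncoderDataset adds noise (word deletion) on the fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, show_progress_bar=True)
# Afterwards, fine-tune `model` on a labeled dataset (e.g. from embedding-training-data)
```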
In our paper [TSDAE](https://arxiv.org/abs/2104.06979) we evaluated several methods for domain adaptation on 4 domain-specific sentence embedding tasks:
As we can see, performance can improve by up to 8 points when you first pre-train on your specific corpus and then fine-tune on the provided labeled training data.
In [GPL](https://arxiv.org/abs/2112.07577) we evaluated these methods for semantic search: given a short query, find the relevant passage. Here, performance can improve by up to 10 points:
A big **disadvantage of adaptive pre-training** is the high computational overhead, as you must first run pre-training on your corpus and then supervised learning on a labeled training dataset. The labeled training datasets can be quite large (e.g. the `all-*-v1` models were trained on over 1 billion training pairs).
## GPL: Generative Pseudo-Labeling
[GPL](https://arxiv.org/abs/2112.07577) overcomes the aforementioned issue: it can be applied on top of a fine-tuned model. Hence, you can use one of the [pre-trained models](https://www.sbert.net/docs/pretrained_models.html) and adapt it to your specific domain.
The longer you train, the better your model gets. In our experiments, we trained the models for about 1 day on a V100 GPU. GPL can be combined with adaptive pre-training, which can give another performance boost.
- **Query Generation**: For a given text from our domain, we first use a T5 model to generate a possible query for that text. E.g. when your text is *"Python is a high-level general-purpose programming language"*, the model might generate a query like *"What is Python"*. You can find various query generators on our [doc2query-hub](https://huggingface.co/doc2query).
- **Negative Mining**: Next, for the generated query *"What is Python"*, we mine negative passages from our corpus, i.e. passages that are similar to the query but which a user would not consider relevant. Such a negative passage could be *"Java is a high-level, class-based, object-oriented programming language."*. We do this mining using dense retrieval, i.e. we use one of the existing text embedding models to retrieve relevant paragraphs for the given query.
- **Pseudo Labeling**: It might be that in the negative mining step we retrieve a passage that is actually relevant for the query (like another definition of *"What is Python"*). To overcome this issue, we use a [Cross-Encoder](https://www.sbert.net/examples/applications/cross-encoder/README.html) to score all (query, passage) pairs.
- **Training**: Once we have the triplets *(generated query, positive passage, mined negative passage)* and the Cross-Encoder scores for *(query, positive)* and *(query, negative)*, we can train the text embedding model using [MarginMSELoss](https://www.sbert.net/docs/package_reference/losses.html#marginmseloss); a minimal sketch of this step follows the list.
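Once those triplets and scores exist, the final step is standard supervised training. A minimal sketch, assuming a single illustrative triplet; the Cross-Encoder and bi-encoder choices here are assumptions, not the exact models used in the paper:

```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, CrossEncoder

model = SentenceTransformer("all-MiniLM-L6-v2")                       # model to adapt (assumed)
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # pseudo-labeler (assumed)

# One generated (query, positive, mined negative) triplet as an example
query, positive, negative = (
    "what is python",
    "Python is a high-level general-purpose programming language",
    "Java is a high-level, class-based, object-oriented programming language.",
)

# Pseudo label: margin between the Cross-Encoder scores of (query, pos) and (query, neg)
score_pos, score_neg = cross_encoder.predict([(query, positive), (query, negative)])
train_examples = [InputExample(texts=[query, positive, negative], label=float(score_pos - score_neg))]

train_dataloader = DataLoader(train_examples, batch_size=32, shuffle=True)
train_loss = losses.MarginMSELoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, show_progress_bar=True)
```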
The **pseudo labeling** step is quite important, as it is responsible for the performance gain over the earlier QGen method, which treated passages simply as positive (1) or negative (0). As we see in the following picture, for a generated query (*"what is futures contract"*), the negative mining step retrieves passages that are partly or highly relevant to the generated query. Using MarginMSELoss and the Cross-Encoder, we can identify these passages and teach the text embedding model that they are also relevant for the given query.
The following table gives an overview of GPL in comparison to adaptive pre-training (MLM and TSDAE). As mentioned, GPL can be combined with adaptive pre-training.
Given a tab-separated file (.tsv) with parallel sentences, where the second column is the translation of the sentence in the first column, for example, in the format:
src1 trg1
src2 trg2
...
where trg_i is the translation of src_i.
Given src_i, the TranslationEvaluator checks which trg_j has the highest cosine similarity. If i == j, we assume
a match, i.e., the correct translation has been found for src_i out of all possible target sentences.
It then computes an accuracy over all source sentences src_i. Analogously, it also computes the accuracy for the other direction.
A high accuracy score indicates that the model is able to find the correct translation out of a large pool of sentences.
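A minimal usage sketch of the evaluator; the file name and the multilingual model are placeholders:

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import TranslationEvaluator

source_sentences, target_sentences = [], []
with open("parallel-sentences.tsv", encoding="utf8") as fIn:  # tab-separated: src \t trg
    for line in fIn:
        src, trg = line.strip().split("\t")
        source_sentences.append(src)
        target_sentences.append(trg)

model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")  # assumed multilingual model
evaluator = TranslationEvaluator(source_sentences, target_sentences, name="parallel-test")
print(evaluator(model))  # accuracy of matching src -> trg (and the reverse direction)
```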
This folder contains various examples to fine-tune `SentenceTransformers` for specific tasks.
To get started, I recommend having a look at the Semantic Textual Similarity ([STS](sts/)) or the Natural Language Inference ([NLI](nli/)) examples.
For documentation on how to train your own models, see the [Training Overview](http://www.sbert.net/docs/training/overview.html).
## Training Examples
- [avg_word_embeddings](avg_word_embeddings/) - This folder contains examples to train models based on classical word embeddings like GloVe. These models are extremely fast, but less accurate than transformer-based models.
- [distillation](distillation/) - Examples to make models smaller, faster and lighter.
- [multilingual](multilingual/) - Existing monolingual models can be extended to various languages ([paper](https://arxiv.org/abs/2004.09813)). This folder contains a step-by-step guide to extend existing models to new languages.
- [nli](nli/) - Natural Language Inference (NLI) data can be quite helpful to pre-train and fine-tune models to create meaningful sentence embeddings.
- [quora_duplicate_questions](quora_duplicate_questions/) - Quora Duplicate Questions is a large corpus of duplicate questions from the Quora community. The folder contains examples of how to train models for duplicate question mining and for semantic search.
- [sts](sts/) - The most basic method to train models is using Semantic Textual Similarity (STS) data. Here, we have a sentence pair and a score indicating their semantic similarity.
- [other](other/) - Various small examples, each showcasing one specific training case.
Embedding models are often encoder models with numerous layers, such as 12 (e.g. [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)) or 6 (e.g. [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)). To get embeddings, every single one of these layers must be traversed. [2D Matryoshka Sentence Embeddings](https://arxiv.org/abs/2402.14776) (2DMSE) revisits this concept by proposing an approach to train embedding models that will perform well when only using a selection of all layers. This results in faster inference speeds at relatively low performance costs.
## Use Cases
The 2DMSE paper mentions that using a few layers of a larger model trained using Adaptive Layers and Matryoshka Representation Learning can outperform a smaller model that was trained like a standard embedding model.
## Results
Let's look at the performance that we may be able to expect from an Adaptive Layer embedding model versus a regular embedding model. For this experiment, I have trained two models:
* [tomaarsen/mpnet-base-nli-adaptive-layer](https://huggingface.co/tomaarsen/mpnet-base-nli-adaptive-layer): Trained by running [adaptive_layer_nli.py](adaptive_layer_nli.py) with [microsoft/mpnet-base](https://huggingface.co/microsoft/mpnet-base).
* [tomaarsen/mpnet-base-nli](https://huggingface.co/tomaarsen/mpnet-base-nli): A nearly identical model to the former, but trained using only `MultipleNegativesRankingLoss` rather than `AdaptiveLayerLoss` on top of `MultipleNegativesRankingLoss`. It also uses [microsoft/mpnet-base](https://huggingface.co/microsoft/mpnet-base) as the base model.
Both of these models were trained on the AllNLI dataset, which is a concatenation of the [SNLI](https://huggingface.co/datasets/snli) and [MultiNLI](https://huggingface.co/datasets/multi_nli) datasets. I have evaluated these models on the [STSBenchmark](https://huggingface.co/datasets/mteb/stsbenchmark-sts) test set using multiple different embedding dimensions. The results are plotted in the following figure:
The first figure shows that the Adaptive Layer model stays much more performant when reducing the number of layers in the model. This is also clearly shown in the second figure, which displays that 80% of the performance is preserved when the number of layers is reduced all the way to 1.
Lastly, the third figure shows the expected speedup ratio for GPU & CPU devices in my tests. As you can see, removing half of the layers results in roughly a 2x speedup, at a cost of ~15% performance on STSB (~86 -> ~75 Spearman correlation). When removing even more layers, the performance benefit gets larger for CPUs, and between 5x and 10x speedups are very feasible with a 20% loss in performance.
## Training
Training with Adaptive Layer support is quite elementary: rather than applying some loss function on only the last layer, we also apply that same loss function on the pooled embeddings from previous layers. Additionally, we employ a KL-divergence loss that aims to make the embeddings of the non-final layers match those of the last layer. This can be seen as a form of [knowledge distillation](../distillation/README.html#knowledge-distillation), with the last layer as the teacher model and the earlier layers as the student models.
For example, with the 12-layer [microsoft/mpnet-base](https://huggingface.co/microsoft/mpnet-base), it will now be trained such that the model produces meaningful embeddings after each of the 12 layers.
Note that training with `AdaptiveLayerLoss` is not notably slower than without using it.
Additionally, this can be combined with the `MatryoshkaLoss` such that the resulting model can be reduced both in the number of layers and in the size of the output dimensions. See the [Matryoshka Embeddings](../matryoshka/README.html) documentation for more information on reducing output dimensions. In Sentence Transformers, the combination of these two losses is called `Matryoshka2dLoss`, and a shorthand is provided for simpler training.
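In code, wrapping an existing loss looks roughly as follows; the base model and the Matryoshka dimensions are illustrative choices:

```python
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("microsoft/mpnet-base")
base_loss = losses.MultipleNegativesRankingLoss(model)

# Adaptive layers only:
train_loss = losses.AdaptiveLayerLoss(model=model, loss=base_loss)

# Or adaptive layers + Matryoshka output dimensions in one shorthand:
train_loss = losses.Matryoshka2dLoss(model=model, loss=base_loss, matryoshka_dims=[768, 512, 256, 128, 64])
```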
After a model has been trained using the Adaptive Layer loss, you can then truncate the model layers to your desired layer count. Note that this requires doing a bit of surgery on the model itself, and each model is structured a bit differently, so the steps are slightly different depending on the model.
First of all, we will load the model & access the underlying `transformers` model like so:
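A minimal sketch, using the adaptive-layer model trained above:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("tomaarsen/mpnet-base-nli-adaptive-layer")

# The underlying Hugging Face transformers model lives in the first module
print(model[0].auto_model)
```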
This output will differ depending on the model. We will look for the repeated layers in the encoder. For this MPNet model, this is stored under `model[0].auto_model.encoder.layer`. Then we can slice the model to only keep the first few layers to speed up the model:
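A sketch of that layer truncation; `new_num_layers` is just an illustrative choice:

```python
new_num_layers = 3

# Keep only the first `new_num_layers` encoder layers (works for this MPNet model;
# other architectures store their layers under different attributes)
model[0].auto_model.encoder.layer = model[0].auto_model.encoder.layer[:new_num_layers]
```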
Then we can run inference with it using <a href="../../../docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode"><code>SentenceTransformer.encode</code></a>.
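A sketch of such an inference call; the three sentences are illustrative examples (two related, one unrelated):

```python
from sentence_transformers import util

embeddings = model.encode(
    [
        "The weather is so nice!",
        "It's so sunny outside!",
        "He drove to the stadium.",
    ]
)
# Similarity of the first sentence with the other two
similarities = util.cos_sim(embeddings[0], embeddings[1:])
print(similarities)
```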
For comparison, the full 12-layer model yields `tensor([[ 0.7547, -0.0162]])` for these two similarities.
As you can see, the similarity between the related sentences is much higher than that with the unrelated sentence, despite only using 3 layers. Feel free to copy this script locally, modify `new_num_layers`, and observe the difference in similarities.
## Code Examples
See the following scripts as examples of how to apply the <a href="../../../docs/package_reference/losses.html#adaptivelayerloss"><code>AdaptiveLayerLoss</code></a> in practice:
* **[adaptive_layer_nli.py](adaptive_layer_nli.py)**: This example uses the `MultipleNegativesRankingLoss` with `AdaptiveLayerLoss` to train a strong embedding model using Natural Language Inference (NLI) data. It is an adaptation of the [NLI](../nli/README) documentation.
* **[adaptive_layer_sts.py](adaptive_layer_sts.py)**: This example uses the `CoSENTLoss` with `AdaptiveLayerLoss` to train an embedding model on the training set of the STSBenchmark dataset. It is an adaptation of the [STS](../sts/README) documentation.
And the following scripts to see how to apply <a href="../../../docs/package_reference/losses.html#matryoshka2dloss"><code>Matryoshka2dLoss</code></a>:
* **[2d_matryoshka_nli.py](../matryoshka/2d_matryoshka_nli.py)**: This example uses the `MultipleNegativesRankingLoss` with `Matryoshka2dLoss` to train a strong embedding model using Natural Language Inference (NLI) data. It is an adaptation of the [NLI](../nli/README) documentation.
* **[2d_matryoshka_sts.py](../matryoshka/2d_matryoshka_sts.py)**: This example uses the `CoSENTLoss` with `Matryoshka2dLoss` to train an embedding model on the training set of the STSBenchmark dataset. It is an adaptation of the [STS](../sts/README) documentation.
This example uses average word embeddings (for example from GloVe). It adds two fully-connected feed-forward layers (dense layers) to create a Deep Averaging Network (DAN).
If 'glove.6B.300d.txt.gz' does not exist, it tries to download it from our server.
See https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/
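A rough sketch of how such a DAN model can be assembled from Sentence Transformers modules; the dimensions and the activation function are illustrative choices:

```python
from torch import nn
from sentence_transformers import SentenceTransformer, models

# Map tokens to GloVe vectors and average them
word_embedding_model = models.WordEmbeddings.from_text_file("glove.6B.300d.txt.gz")
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Two dense layers on top of the averaged embeddings -> Deep Averaging Network (DAN)
embedding_dim = pooling_model.get_sentence_embedding_dimension()
dense_1 = models.Dense(in_features=embedding_dim, out_features=embedding_dim, activation_function=nn.Tanh())
dense_2 = models.Dense(in_features=embedding_dim, out_features=embedding_dim, activation_function=nn.Tanh())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_1, dense_2])
```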