"""
This script contains an example how to extend an existent sentence embedding model to new languages.

Given a (monolingual) teacher model you would like to extend to new languages, which is specified in the teacher_model_name
variable. We train a multilingual student model to imitate the teacher model (variable student_model_name)
on multiple languages.

For training, you need parallel sentence data (machine translation training data). You need tab-seperated files (.tsv)
with the first column a sentence in a language understood by the teacher model, e.g. English,
and the further columns contain the according translations for languages you want to extend to.

This scripts downloads automatically the parallel sentences corpus. This corpus contains transcripts from
talks translated to 100+ languages. For other parallel data, see get_parallel_data_[].py scripts

Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""

from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
from datetime import datetime

import os
import logging
import sentence_transformers.util
import csv
import gzip
from tqdm.autonotebook import tqdm
import numpy as np
import zipfile
import io

logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)


teacher_model_name = (
    "paraphrase-distilroberta-base-v2"  # Our monolingual teacher model, we want to convert to multiple languages
)
student_model_name = "xlm-roberta-base"  # Multilingual base model we use to imitate the teacher model

max_seq_length = 128  # Student model max. lengths for inputs (number of word pieces)
train_batch_size = 64  # Batch size for training
inference_batch_size = 64  # Batch size at inference
max_sentences_per_language = 500000  # Maximum number of  parallel sentences for training
train_max_sentence_length = 250  # Maximum length (characters) for parallel training sentences

num_epochs = 5  # Train for x epochs
num_warmup_steps = 10000  # Warumup steps

num_evaluation_steps = 1000  # Evaluate performance after every xxxx steps
dev_sentences = 1000  # Number of parallel sentences to be used for development


# Define the language codes you would like to extend the model to
source_languages = set(["en"])  # Our teacher model accepts English (en) sentences
target_languages = set(
    ["de", "es", "it", "fr", "ar", "tr"]
)  # We want to extend the model to these new languages. For language codes, see the header of the train file


output_path = (
    "output/make-multilingual-"
    + "-".join(sorted(list(source_languages)) + sorted(list(target_languages)))
    + "-"
    + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)


# This function downloads a corpus if it does not exist
def download_corpora(filepaths):
    if not isinstance(filepaths, list):
        filepaths = [filepaths]

    for filepath in filepaths:
        if not os.path.exists(filepath):
            print(filepath, "does not exists. Try to download from server")
            filename = os.path.basename(filepath)
            url = "https://sbert.net/datasets/" + filename
            sentence_transformers.util.http_get(url, filepath)


# Here we define train train and dev corpora
train_corpus = "datasets/parallel-sentences.tsv.gz"
sts_corpus = "datasets/stsbenchmark.zip"
parallel_sentences_folder = "parallel-sentences/"

# Check if the file exists. If not, they are downloaded
download_corpora([train_corpus, sts_corpus])


# Create parallel files for the selected language combinations
os.makedirs(parallel_sentences_folder, exist_ok=True)
train_files = []
dev_files = []
files_to_create = []
for source_lang in source_languages:
    for target_lang in target_languages:
        output_filename_train = os.path.join(
            parallel_sentences_folder, "talks-{}-{}-train.tsv.gz".format(source_lang, target_lang)
        )
        output_filename_dev = os.path.join(
            parallel_sentences_folder, "talks-{}-{}-dev.tsv.gz".format(source_lang, target_lang)
        )
        train_files.append(output_filename_train)
        dev_files.append(output_filename_dev)
        if not os.path.exists(output_filename_train) or not os.path.exists(output_filename_dev):
            files_to_create.append(
                {
                    "src_lang": source_lang,
                    "trg_lang": target_lang,
                    "fTrain": gzip.open(output_filename_train, "wt", encoding="utf8"),
                    "fDev": gzip.open(output_filename_dev, "wt", encoding="utf8"),
                    "devCount": 0,
                }
            )

if len(files_to_create) > 0:
    print(
        "Parallel sentences files {} do not exist. Create these files now".format(
            ", ".join(map(lambda x: x["src_lang"] + "-" + x["trg_lang"], files_to_create))
        )
    )
    with gzip.open(train_corpus, "rt", encoding="utf8") as fIn:
        reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
        for line in tqdm(reader, desc="Sentences"):
            for outfile in files_to_create:
                src_text = line[outfile["src_lang"]].strip()
                trg_text = line[outfile["trg_lang"]].strip()

                if src_text != "" and trg_text != "":
                    if outfile["devCount"] < dev_sentences:
                        outfile["devCount"] += 1
                        fOut = outfile["fDev"]
                    else:
                        fOut = outfile["fTrain"]

                    fOut.write("{}\t{}\n".format(src_text, trg_text))

    for outfile in files_to_create:
        outfile["fTrain"].close()
        outfile["fDev"].close()


######## Start the extension of the teacher model to multiple languages ########
logger.info("Load teacher model")
teacher_model = SentenceTransformer(teacher_model_name)


logger.info("Create student model from scratch")
word_embedding_model = models.Transformer(student_model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(
    student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True
)
for train_file in train_files:
    train_data.load_data(
        train_file, max_sentences=max_sentences_per_language, max_sentence_length=train_max_sentence_length
    )

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)


#### Evaluate cross-lingual performance on different tasks #####
evaluators = []  # evaluators has a list of different evaluator classes we call periodically

for dev_file in dev_files:
    logger.info("Create evaluator for " + dev_file)
    src_sentences = []
    trg_sentences = []
    with gzip.open(dev_file, "rt", encoding="utf8") as fIn:
        for line in fIn:
            splits = line.strip().split("\t")
            if splits[0] != "" and splits[1] != "":
                src_sentences.append(splits[0])
                trg_sentences.append(splits[1])

    # Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
    dev_mse = evaluation.MSEEvaluator(
        src_sentences,
        trg_sentences,
        name=os.path.basename(dev_file),
        teacher_model=teacher_model,
        batch_size=inference_batch_size,
    )
    evaluators.append(dev_mse)

    # TranslationEvaluator computes the embeddings for all parallel sentences. It then check if the embedding of source[i] is the closest to target[i] out of all available target sentences
    dev_trans_acc = evaluation.TranslationEvaluator(
        src_sentences, trg_sentences, name=os.path.basename(dev_file), batch_size=inference_batch_size
    )
    evaluators.append(dev_trans_acc)


##### Read cross-lingual Semantic Textual Similarity (STS) data ####
all_languages = list(set(list(source_languages) + list(target_languages)))
sts_data = {}

# Open the ZIP File of STS2017-extended.zip and check for which language combinations we have STS data
with zipfile.ZipFile(sts_corpus) as zip:
    filelist = zip.namelist()
    sts_files = []

    for i in range(len(all_languages)):
        for j in range(i, len(all_languages)):
            lang1 = all_languages[i]
            lang2 = all_languages[j]
            filepath = "STS2017-extended/STS.{}-{}.txt".format(lang1, lang2)
            if filepath not in filelist:
                lang1, lang2 = lang2, lang1
                filepath = "STS2017-extended/STS.{}-{}.txt".format(lang1, lang2)

            if filepath in filelist:
                filename = os.path.basename(filepath)
                sts_data[filename] = {"sentences1": [], "sentences2": [], "scores": []}

                fIn = zip.open(filepath)
                for line in io.TextIOWrapper(fIn, "utf8"):
                    sent1, sent2, score = line.strip().split("\t")
                    score = float(score)
                    sts_data[filename]["sentences1"].append(sent1)
                    sts_data[filename]["sentences2"].append(sent2)
                    sts_data[filename]["scores"].append(score)

for filename, data in sts_data.items():
    test_evaluator = evaluation.EmbeddingSimilarityEvaluator(
        data["sentences1"],
        data["sentences2"],
        data["scores"],
        batch_size=inference_batch_size,
        name=filename,
        show_progress_bar=False,
    )
    evaluators.append(test_evaluator)


# Train the model
student_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)),
    epochs=num_epochs,
    warmup_steps=num_warmup_steps,
    evaluation_steps=num_evaluation_steps,
    output_path=output_path,
    save_best_model=True,
    optimizer_params={"lr": 2e-5, "eps": 1e-6},
)