import evaluate
import datasets


def strip(resps, docs):
    """
    Keep only the first response of each entry, with surrounding whitespace removed.

    Assuming each entry of `resps` is a list of model responses, we discard
    all but the first response and call ``str.strip()`` on it.

    Args:
        resps: iterable whose entries are indexable collections of strings.
        docs: unused here; presumably required by the caller's filter
            signature -- TODO confirm against the harness.

    Returns:
        A lazy ``map`` iterator yielding one stripped string per entry.
        It can be consumed only once.
    """
    return map(lambda r: r[0].strip(), resps)


def dr_ar(dataset: datasets.Dataset):
    """Return only the rows of `dataset` whose ``"direction"`` is ``"dr_ar"``."""
    # NOTE(review): "dr_ar" presumably denotes the Darija -> Arabic direction; confirm.
    return dataset.filter(lambda x: x["direction"] == "dr_ar")


def ar_dr(dataset: datasets.Dataset):
    """Return only the rows of `dataset` whose ``"direction"`` is ``"ar_dr"``."""
    # NOTE(review): "ar_dr" presumably denotes the Arabic -> Darija direction; confirm.
    return dataset.filter(lambda x: x["direction"] == "ar_dr")


def doc_to_text(doc):
    """Return the ``"content"`` of the first entry in ``doc["messages"]``."""
    # NOTE(review): the first message is presumably the prompt/user turn; confirm.
    return doc["messages"][0]["content"]


def doc_to_target(doc):
    """Return the ``"content"`` of the second entry in ``doc["messages"]``."""
    # NOTE(review): the second message is presumably the reference answer; confirm.
    return doc["messages"][1]["content"]


def bert(items):
    """Return `items` unchanged."""
    # NOTE(review): looks like a pass-through so that the aggregation functions
    # below receive the raw (prediction, reference) pairs -- confirm with the
    # task configuration.
    return items


def Average(lst):
    """
    Return the arithmetic mean of `lst`.

    Raises:
        ZeroDivisionError: if `lst` is empty.
    """
    return sum(lst) / len(lst)


def _bertscore_f1(items, model_type, num_layers=12):
    """
    Compute the mean BERTScore F1 over `items` with the given model.

    Shared implementation for `arabizibert`, `darijabert` and `mbert`, which
    differ only in the model identifier they pass in.

    Args:
        items: non-empty iterable of ``(prediction, reference)`` pairs.
        model_type: model identifier forwarded to the metric as ``model_type``.
        num_layers: value forwarded to the metric as ``num_layers``.

    Returns:
        The mean of the ``"f1"`` entries returned by the metric.

    Raises:
        ValueError: if `items` is empty (the pairs cannot be unzipped).
    """
    scorer = evaluate.load("bertscore")
    # Transpose the pairs into one tuple of predictions and one of references.
    predictions, references = zip(*items)
    scores = scorer.compute(
        predictions=predictions,
        references=references,
        model_type=model_type,
        num_layers=num_layers,
    )
    return Average(scores['f1'])


def arabizibert(items):
    """Return the mean BERTScore F1 of `items` using ``SI2M-Lab/DarijaBERT-arabizi``."""
    return _bertscore_f1(items, "SI2M-Lab/DarijaBERT-arabizi")


def darijabert(items):
    """Return the mean BERTScore F1 of `items` using ``SI2M-Lab/DarijaBERT``."""
    return _bertscore_f1(items, 'SI2M-Lab/DarijaBERT')


def mbert(items):
    """Return the mean BERTScore F1 of `items` using ``google-bert/bert-base-multilingual-cased``."""
    return _bertscore_f1(items, 'google-bert/bert-base-multilingual-cased')