Unverified Commit 71f2954b authored by Vladislav Mikhailov's avatar Vladislav Mikhailov Committed by GitHub
Browse files

Added NorEval, a novel Norwegian benchmark (#2919)

* added noreval

* added a checklist for noreval

* run pre-commit

* changed imports and added short noreval description

* fixed norsumm path

* refactored multi-folder tasks

* refactored multi-folder tasks
parent ab618f01
import datasets
import numpy as np

# Import the generation-metric dependencies inside a guard so that a missing
# package produces an actionable error message. (Previously these modules were
# also imported unconditionally above the try block, which raised a bare
# ModuleNotFoundError before the helpful message could ever be emitted.)
try:
    import sacrebleu
    from rouge_score import rouge_scorer, scoring
except ModuleNotFoundError as e:
    raise type(e)(
        "`sacrebleu` and `rouge_score` are required for evaluating the model on NorEval."
    ) from e

# Lazily-initialized singleton RougeScorer; see `rouge()` below.
ROUGE_SCORER = None
def preprocess_function(examples):
    """Normalize one NorTruthfulQA example.

    Strips surrounding whitespace from the question and every answer, drops
    empty answers, and guarantees each remaining answer ends with a period.

    :param examples: A mapping with ``question``, ``correct_answers`` and
        ``incorrect_answers`` fields.
    :return: A dict with the cleaned ``question``, ``correct_answers`` and
        ``incorrect_answers``.
    """

    def _format_answers(answers):
        cleaned = []
        for raw in answers:
            text = raw.strip()
            if not text:
                # Skip empty answers entirely.
                continue
            # Every kept answer is terminated with a period.
            cleaned.append(text if text.endswith(".") else text + ".")
        return cleaned

    return {
        "question": examples["question"].strip(),
        "correct_answers": _format_answers(examples["correct_answers"]),
        "incorrect_answers": _format_answers(examples["incorrect_answers"]),
    }
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Apply `preprocess_function` to every example of the HF dataset."""
    return dataset.map(preprocess_function)
def process_results(doc, results):
    """Score a generated completion against reference answers.

    Computes TruthfulQA-style BLEU and ROUGE metrics: for each metric, the
    best score over the correct references (``*_max``), whether the best
    correct score beats the best incorrect one (``*_acc``), and the margin
    between the two (``*_diff``).

    :param doc: Example with ``correct_answers`` and ``incorrect_answers``.
    :param results: Model outputs; only the first completion is scored.
    :return: Dict of bleu/rouge1/rouge2/rougeL max/acc/diff values.
    """
    completion = results[0]
    true_refs = doc["correct_answers"]
    false_refs = doc["incorrect_answers"]
    all_refs = true_refs + false_refs
    n_true = len(true_refs)

    scores = {}

    # BLEU: score the completion against each reference individually.
    bleu_all = [bleu([[ref]], [completion]) for ref in all_refs]
    bleu_best_true = np.nanmax(bleu_all[:n_true])
    bleu_best_false = np.nanmax(bleu_all[n_true:])
    scores["bleu_max"] = bleu_best_true
    scores["bleu_acc"] = int(bleu_best_true > bleu_best_false)
    scores["bleu_diff"] = bleu_best_true - bleu_best_false

    # ROUGE: one scorer call per reference yields rouge1/rouge2/rougeLsum.
    rouge_all = [rouge([ref], [completion]) for ref in all_refs]
    # Output keys use "rougeL" while the scorer reports "rougeLsum".
    for out_key, score_key in (
        ("rouge1", "rouge1"),
        ("rouge2", "rouge2"),
        ("rougeL", "rougeLsum"),
    ):
        values = [s[score_key] for s in rouge_all]
        best_true = np.nanmax(values[:n_true])
        best_false = np.nanmax(values[n_true:])
        scores[f"{out_key}_max"] = best_true
        scores[f"{out_key}_acc"] = int(best_true > best_false)
        scores[f"{out_key}_diff"] = best_true - best_false

    return scores
def bleu(refs, preds):
    """
    Returns `t5` style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    # sacrebleu expects the system outputs first, then the reference streams.
    # `tokenize="intl"` applies international tokenization (appropriate for
    # Norwegian text); smoothing and casing settings follow the t5 metric.
    result = sacrebleu.corpus_bleu(
        preds,
        refs,
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    )
    return result.score
def rouge(refs, preds):
    """
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`.
    :return:
        Dict mapping "rouge1"/"rouge2"/"rougeLsum" to mid f-measure * 100.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    global ROUGE_SCORER
    if ROUGE_SCORER is None:
        # Init RougeScorer once (https://github.com/EleutherAI/lm-evaluation-harness/issues/1692)
        # -- rouge_types are constant. (A redundant per-call RougeScorer
        # construction that defeated this cache has been removed.)
        ROUGE_SCORER = rouge_scorer.RougeScorer(rouge_types)
    scorer = ROUGE_SCORER

    def _prepare_summary(summary):
        # Add newlines between sentences to correctly compute `rougeLsum`.
        summary = summary.replace(" . ", ".\n")
        return summary

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        ref = _prepare_summary(ref)
        pred = _prepare_summary(pred)
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    # Avoid shadowing the builtin `type` in the comprehension variable.
    return {rouge_type: result[rouge_type].mid.fmeasure * 100 for rouge_type in rouge_types}
# Shared base config for the NorTruthfulQA multiple-choice tasks.
# Language-specific task files include this and supply `dataset_name`
# and a `doc_to_text` prompt function.
dataset_path: ltg/nortruthfulqa_mc
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: null
num_fewshot: 0
# The first choice in `mc1_targets.choices` is the correct answer.
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# NOTE(review): this span concatenates five per-prompt task configs (p0-p4)
# for NorTruthfulQA MC in Norwegian Nynorsk; each stanza appears to be a
# separate YAML file that includes the shared base config.
# p0: question-only prompt (see utils.p0_nn).
tag: nortruthfulqa_mc_nno
task: nortruthfulqa_mc_nno_p0
dataset_name: nn
include: ../_nortruthfulqa_mc_yaml
doc_to_text: !function utils.p0_nn
# p1: question plus a bulleted list of answer options (see utils.p1_nn).
tag: nortruthfulqa_mc_nno
task: nortruthfulqa_mc_nno_p1
dataset_name: nn
include: ../_nortruthfulqa_mc_yaml
doc_to_text: !function utils.p1_nn
# p2: "which of the following options is correct" phrasing (see utils.p2_nn);
# the prompt ends with the options, so targets are separated by a newline.
tag: nortruthfulqa_mc_nno
task: nortruthfulqa_mc_nno_p2
dataset_name: nn
include: ../_nortruthfulqa_mc_yaml
doc_to_text: !function utils.p2_nn
target_delimiter: "\n"
# p3: "given the following question" phrasing (see utils.p3_nn).
tag: nortruthfulqa_mc_nno
task: nortruthfulqa_mc_nno_p3
dataset_name: nn
include: ../_nortruthfulqa_mc_yaml
doc_to_text: !function utils.p3_nn
target_delimiter: "\n"
# p4: bare question followed by "choose one of the answers" (see utils.p4_nn).
tag: nortruthfulqa_mc_nno
task: nortruthfulqa_mc_nno_p4
dataset_name: nn
include: ../_nortruthfulqa_mc_yaml
doc_to_text: !function utils.p4_nn
def p0_nn(doc):
    """Nynorsk prompt p0: the bare question followed by an answer cue."""
    return f"Spørsmål: {doc['question']}\n\nSvar:"
def p1_nn(doc):
    """Nynorsk prompt p1: question, bulleted answer options, answer cue."""
    options = "".join(f"\n- {option}" for option in doc["mc1_targets"]["choices"])
    return f"Spørsmål: {doc['question']}\n\nSvaralternativ:{options}\n\nSvar:"
def p2_nn(doc):
    """Nynorsk prompt p2: 'which option is correct?' followed by options."""
    options = "".join(f"\n- {option}" for option in doc["mc1_targets"]["choices"])
    return (
        f"Spørsmål: {doc['question']}\n\n"
        f"Kva av følgande alternativ er rett svar på spørsmålet?{options}"
    )
def p3_nn(doc):
    """Nynorsk prompt p3: 'given the question, which answer is correct?'."""
    options = "".join(f"\n- {option}" for option in doc["mc1_targets"]["choices"])
    return (
        "Gitt følgande spørsmål, kva av dei moglege svara under er rett?\n"
        f"Spørsmål: {doc['question']}\n{options}"
    )
def p4_nn(doc):
    """Nynorsk prompt p4: question, 'choose one of the answers', answer cue."""
    options = "".join(f"\n- {option}" for option in doc["mc1_targets"]["choices"])
    return f"{doc['question']}\nVel eit av følgande moglege svar:{options}\n\nSvar:"
# NOTE(review): this span concatenates five per-prompt task configs (p0-p4)
# for NorTruthfulQA MC in Norwegian Bokmål; each stanza appears to be a
# separate YAML file that includes the shared base config.
# p0: question-only prompt (see utils.p0_nb).
tag: nortruthfulqa_mc_nob
task: nortruthfulqa_mc_nob_p0
dataset_name: nb
include: ../_nortruthfulqa_mc_yaml
doc_to_text: !function utils.p0_nb
# p1: question plus a bulleted list of answer options (see utils.p1_nb).
tag: nortruthfulqa_mc_nob
task: nortruthfulqa_mc_nob_p1
dataset_name: nb
include: ../_nortruthfulqa_mc_yaml
doc_to_text: !function utils.p1_nb
# p2: "which of the following options is correct" phrasing (see utils.p2_nb);
# the prompt ends with the options, so targets are separated by a newline.
tag: nortruthfulqa_mc_nob
task: nortruthfulqa_mc_nob_p2
dataset_name: nb
include: ../_nortruthfulqa_mc_yaml
doc_to_text: !function utils.p2_nb
target_delimiter: "\n"
# p3: "given the following question" phrasing (see utils.p3_nb).
tag: nortruthfulqa_mc_nob
task: nortruthfulqa_mc_nob_p3
dataset_name: nb
include: ../_nortruthfulqa_mc_yaml
doc_to_text: !function utils.p3_nb
target_delimiter: "\n"
# p4: bare question followed by "choose one of the answers" (see utils.p4_nb).
tag: nortruthfulqa_mc_nob
task: nortruthfulqa_mc_nob_p4
dataset_name: nb
include: ../_nortruthfulqa_mc_yaml
doc_to_text: !function utils.p4_nb
def p0_nb(doc):
    """Bokmål prompt p0: the bare question followed by an answer cue."""
    return f"Spørsmål: {doc['question']}\n\nSvar:"
def p1_nb(doc):
    """Bokmål prompt p1: question, bulleted answer options, answer cue."""
    options = "".join(f"\n- {option}" for option in doc["mc1_targets"]["choices"])
    return f"Spørsmål: {doc['question']}\n\nSvaralternativer:{options}\n\nSvar:"
def p2_nb(doc):
    """Bokmål prompt p2: 'which option is correct?' followed by options."""
    options = "".join(f"\n- {option}" for option in doc["mc1_targets"]["choices"])
    return (
        f"Spørsmål: {doc['question']}\n\n"
        f"Hvilke av følgende alternativer er riktig svar på spørsmålet?{options}"
    )
def p3_nb(doc):
    """Bokmål prompt p3: 'given the question, which answer is correct?'."""
    options = "".join(f"\n- {option}" for option in doc["mc1_targets"]["choices"])
    return (
        "Gitt følgende spørsmål, hvilket av de mulige svarene under er riktig?\n"
        f"Spørsmål: {doc['question']}\n{options}"
    )
def p4_nb(doc):
    """Bokmål prompt p4: question, 'choose one of the answers', answer cue."""
    options = "".join(f"\n- {option}" for option in doc["mc1_targets"]["choices"])
    return f"{doc['question']}\nVelg et av følgende mulige svar:{options}\n\nSvar:"
# Shared base config for the NRK Quiz QA multiple-choice tasks.
# Language-specific task files include this and supply `dataset_name`,
# a `doc_to_text` prompt function and a `doc_to_choice` template.
dataset_path: ltg/nrk_quiz_qa
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
num_fewshot: 0
# Target index is the position of `answer` within the choice labels.
doc_to_target: "{{choices.label.index(answer)}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# NOTE(review): this span concatenates five per-prompt task configs (p0-p4)
# for NRK Quiz QA in Norwegian Nynorsk; each stanza appears to be a separate
# YAML file including the shared base config. p0/p1/p4 score the full option
# texts (`choices.text`); p2/p3 score the option labels (`choices.label`).
tag: nrk_quiz_qa_nno
task: nrk_quiz_qa_nno_p0
dataset_name: nn
include: ../_nrk_quiz_qa_yaml
doc_to_text: !function utils.p0_nn
doc_to_choice: "{{choices.text}}"
tag: nrk_quiz_qa_nno
task: nrk_quiz_qa_nno_p1
dataset_name: nn
include: ../_nrk_quiz_qa_yaml
doc_to_text: !function utils.p1_nn
doc_to_choice: "{{choices.text}}"
tag: nrk_quiz_qa_nno
task: nrk_quiz_qa_nno_p2
dataset_name: nn
include: ../_nrk_quiz_qa_yaml
doc_to_text: !function utils.p2_nn
doc_to_choice: "{{choices.label}}"
tag: nrk_quiz_qa_nno
task: nrk_quiz_qa_nno_p3
dataset_name: nn
include: ../_nrk_quiz_qa_yaml
doc_to_text: !function utils.p3_nn
doc_to_choice: "{{choices.label}}"
tag: nrk_quiz_qa_nno
task: nrk_quiz_qa_nno_p4
dataset_name: nn
include: ../_nrk_quiz_qa_yaml
doc_to_text: !function utils.p4_nn
doc_to_choice: "{{choices.text}}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment