Unverified Commit 71f2954b authored by Vladislav Mikhailov's avatar Vladislav Mikhailov Committed by GitHub
Browse files

Added NorEval, a novel Norwegian benchmark (#2919)

* added noreval

* added a checklist for noreval

* run pre-commit

* changed imports and added short noreval description

* fixed norsumm path

* refactored multi-folder tasks

* refactored multi-folder tasks
parent ab618f01
tag: norsumm_nno
dataset_name: nn
task: norsumm_nno_p5
include: ../_norsumm_yaml
doc_to_text: "Heile artikkelen:\n{{article}}\n\nHovudpunkt:"
tag: norsumm_nob
dataset_name: nb
task: norsumm_nob_p0
include: ../_norsumm_yaml
doc_to_text: "Skriv en oppsummering av følgende artikkel med kun noen punkter: {{article}}\nOppsummering:"
tag: norsumm_nob
dataset_name: nb
task: norsumm_nob_p1
include: ../_norsumm_yaml
doc_to_text: "Oppsummer følgende artikkel med noen setninger: {{article}}\nOppsummering:"
tag: norsumm_nob
dataset_name: nb
task: norsumm_nob_p2
include: ../_norsumm_yaml
doc_to_text: "{{article}}\nSkriv en kort og presis oppsummering av teksten over. Språket må være klart og lett å forstå. Sørg for å ikke introdusere feil. Oppsummeringen må dekke følgende spørsmål: hvem, hva, hvor, når, og hvorfor er denne saken viktig å vite om. Oppsummeringen må være engasjerende og fremheve nøkkelinformasjon fra artikkelen. Oppsummeringen skal inneholde maksimalt 700 tegn, inkludert mellomrom."
target_delimiter: "\n"
tag: norsumm_nob
dataset_name: nb
task: norsumm_nob_p3
include: ../_norsumm_yaml
doc_to_text: "Gi et kortfattet sammendrag av følgende tekst: {{article}}"
target_delimiter: "\n"
tag: norsumm_nob
dataset_name: nb
task: norsumm_nob_p4
include: ../_norsumm_yaml
doc_to_text: "Lag en kort oppsummering som sammenfatter den følgende teksten i noen punkter:\n{{article}}\n\nOppsummering:"
tag: norsumm_nob
dataset_name: nb
task: norsumm_nob_p5
include: ../_norsumm_yaml
doc_to_text: "Hele artikkelen:\n{{article}}\n\nHovedpunkter:"
import datasets
import numpy as np
from evaluate import load
try:
import bert_score
import sacrebleu
from rouge_score import rouge_scorer, scoring
except ModuleNotFoundError as e:
raise type(e)(
"`sacrebleu`, `bert_score`, and `rouge_score` are required for evaluating the model on NorEval."
) from e
# Lazily-initialized singletons: the RougeScorer and the BERTScore metric are
# expensive to construct, so they are built once on first use and cached here
# (see https://github.com/EleutherAI/lm-evaluation-harness/issues/1692).
ROUGE_SCORER = None
BERTSCORE = None
def process_results(doc, results):
    """Score one generated summary against every reference summary.

    For each metric (BLEU, ROUGE-Lsum, BERTScore F1) the completion is
    compared to each reference in ``doc["summaries"]`` individually, and
    both the best (max) and the average score over references are reported.

    :param doc: dataset document; must contain a ``summaries`` list of
        reference strings.
    :param results: list whose first element is the model's completion.
    :return: dict with ``*_max`` and ``*_avg`` entries for each metric.
    """
    prediction = results[0]
    refs = doc["summaries"]

    # Per-reference scores; nan-aware reductions tolerate degenerate cases.
    bleus = [bleu([[ref]], [prediction]) for ref in refs]
    rouges = [rouge([ref], [prediction])["rougeLsum"] for ref in refs]
    berts = [
        bertscore_f1(references=[ref], predictions=[prediction]) for ref in refs
    ]

    return {
        "bleu_max": np.nanmax(bleus),
        "bleu_avg": np.nanmean(bleus),
        "rougeL_max": np.nanmax(rouges),
        "rougeL_avg": np.nanmean(rouges),
        "bertscore_f1_max": np.nanmax(berts),
        "bertscore_f1_avg": np.nanmean(berts),
    }
def bleu(refs, preds):
    """Return a `t5`-style corpus BLEU score via sacrebleu.

    Mirrors the reference implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    # Fixed settings matching the t5 metric: exponential smoothing,
    # international tokenization, no lowercasing.
    sacrebleu_kwargs = {
        "smooth_method": "exp",
        "smooth_value": 0.0,
        "force": False,
        "lowercase": False,
        "tokenize": "intl",
        "use_effective_order": False,
    }
    result = sacrebleu.corpus_bleu(preds, refs, **sacrebleu_kwargs)
    return result.score
def rouge(refs, preds):
    """Return `t5`-style ROUGE scores (here: rougeLsum only).

    See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    :return: dict mapping each ROUGE type to its mid F-measure * 100.
    """
    rouge_types = ["rougeLsum"]
    global ROUGE_SCORER
    if ROUGE_SCORER is None:
        # Init RougeScorer once (https://github.com/EleutherAI/lm-evaluation-harness/issues/1692)
        # -- rouge_types are constant.
        ROUGE_SCORER = rouge_scorer.RougeScorer(rouge_types)
    scorer = ROUGE_SCORER

    def _prepare_summary(summary):
        # `rougeLsum` expects sentences to be separated by newlines.
        return summary.replace(" . ", ".\n")

    # Accumulate confidence intervals over all (ref, pred) pairs.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        aggregator.add_scores(
            scorer.score(_prepare_summary(ref), _prepare_summary(pred))
        )
    result = aggregator.aggregate()
    # Bug fix: the original comprehension variable shadowed the builtin `type`.
    return {rouge_type: result[rouge_type].mid.fmeasure * 100 for rouge_type in rouge_types}
def bertscore_f1(references, predictions):
    """Compute the BERTScore F1 for a single prediction/reference pair.

    Args:
        references: A list of reference strings.
        predictions: A list of predicted strings.

    Returns:
        The F1 score of the BERTScore metric (first element of the batch).
    """
    global BERTSCORE
    if BERTSCORE is None:
        # Load the metric once and cache it module-wide.
        BERTSCORE = load("bertscore")
    scores = BERTSCORE.compute(
        predictions=predictions,
        references=references,
        model_type="bert-base-multilingual-cased",
        num_layers=9,
    )
    return scores["f1"][0]
task: norsummarize_instruct
dataset_path: ltg/norsummarize-instruct
training_split: null
test_split: test
doc_to_text: "{{prompt}} {{context}}"
doc_to_target: response
output_type: generate_until
metric_list:
- metric: bleu
higher_is_better: true
- metric: chrf
higher_is_better: true
generation_kwargs:
until:
- "\n"
do_sample: false
num_beams: 1
max_new_tokens: 256
metadata:
version: 1.0
dataset_path: ltg/nortruthfulqa_gen
output_type: generate_until
training_split: null
validation_split: validation
test_split: null
num_fewshot: 0
doc_to_target: " "
process_docs: !function utils.process_docs
process_results: !function utils.process_results
generation_kwargs:
until:
- "\n"
do_sample: false
num_beams: 1
max_new_tokens: 64
metric_list:
- metric: bleu_max
aggregation: mean
higher_is_better: true
- metric: bleu_acc
aggregation: mean
higher_is_better: true
- metric: bleu_diff
aggregation: mean
higher_is_better: true
- metric: rouge1_max
aggregation: mean
higher_is_better: true
- metric: rouge1_acc
aggregation: mean
higher_is_better: true
- metric: rouge1_diff
aggregation: mean
higher_is_better: true
- metric: rouge2_max
aggregation: mean
higher_is_better: true
- metric: rouge2_acc
aggregation: mean
higher_is_better: true
- metric: rouge2_diff
aggregation: mean
higher_is_better: true
- metric: rougeL_max
aggregation: mean
higher_is_better: true
- metric: rougeL_acc
aggregation: mean
higher_is_better: true
- metric: rougeL_diff
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
tag: nortruthfulqa_gen_nno
task: nortruthfulqa_gen_nno_p0
dataset_name: nn
include: ../_nortruthfulqa_gen_yaml
doc_to_text: question
tag: nortruthfulqa_gen_nno
task: nortruthfulqa_gen_nno_p1
dataset_name: nn
include: ../_nortruthfulqa_gen_yaml
doc_to_text: "Spørsmål: {{question}}\nSvar:"
tag: nortruthfulqa_gen_nno
task: nortruthfulqa_gen_nno_p2
dataset_name: nn
include: ../_nortruthfulqa_gen_yaml
doc_to_text: "Skriv svaret på følgande spørsmål: {{question}}\nSvar:"
tag: nortruthfulqa_gen_nno
task: nortruthfulqa_gen_nno_p3
dataset_name: nn
include: ../_nortruthfulqa_gen_yaml
doc_to_text: "{{question}}\nKva er rett svar på spørsmålet?\nSvar:"
tag: nortruthfulqa_gen_nno
task: nortruthfulqa_gen_nno_p4
dataset_name: nn
include: ../_nortruthfulqa_gen_yaml
doc_to_text: "Svar sant på følgande: {{question}}\nSvar:"
tag: nortruthfulqa_gen_nob
task: nortruthfulqa_gen_nob_p0
dataset_name: nb
include: ../_nortruthfulqa_gen_yaml
doc_to_text: question
tag: nortruthfulqa_gen_nob
task: nortruthfulqa_gen_nob_p1
dataset_name: nb
include: ../_nortruthfulqa_gen_yaml
doc_to_text: "Spørsmål: {{question}}\nSvar:"
tag: nortruthfulqa_gen_nob
task: nortruthfulqa_gen_nob_p2
dataset_name: nb
include: ../_nortruthfulqa_gen_yaml
doc_to_text: "Skriv svaret på følgende spørsmål: {{question}}\nSvar:"
tag: nortruthfulqa_gen_nob
task: nortruthfulqa_gen_nob_p3
dataset_name: nb
include: ../_nortruthfulqa_gen_yaml
doc_to_text: "{{question}}\nHva er riktig svar på spørsmålet?\nSvar:"
tag: nortruthfulqa_gen_nob
task: nortruthfulqa_gen_nob_p4
dataset_name: nb
include: ../_nortruthfulqa_gen_yaml
doc_to_text: "Svar sant på følgende: {{question}}\nSvar:"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment