Unverified commit 71f2954b, authored by Vladislav Mikhailov and committed by GitHub
Browse files

Added NorEval, a novel Norwegian benchmark (#2919)

* added noreval

* added a checklist for noreval

* run pre-commit

* changed imports and added short noreval description

* fixed norsumm path

* refactored multi-folder tasks

* refactored multi-folder tasks
parent ab618f01
import numpy as np
import sklearn
def multi_f1(items):
    """
    Compute the macro-averaged F1 score over a collection of result pairs.

    Args:
        items: An iterable of ``(prediction, gold)`` pairs. The pairs are
            unzipped into two parallel sequences before scoring.

    Returns:
        The value returned by ``sklearn.metrics.f1_score`` with
        ``average="macro"``.

    Raises:
        ValueError: If ``items`` is empty (there is nothing to unpack).
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not by
    # itself guarantee that the ``sklearn.metrics`` attribute has been loaded.
    from sklearn.metrics import f1_score

    preds, golds = zip(*items)
    preds = np.array(preds)
    golds = np.array(golds)
    # f1_score expects the reference labels first, then the predictions.
    return f1_score(golds, preds, average="macro")
# Shared base configuration for the NorIdiom idiom-completion tasks.
# Per-prompt task files include this file and add their own tag, task name,
# document filter and prompt template.
dataset_path: Sprakbanken/Norwegian_idioms
training_split: null
validation_split: null
test_split: test
num_fewshot: 0
output_type: generate_until
doc_to_target: completion
# Scores each generation; the returned keys match the metric names below.
process_results: !function utils.process_results
generation_kwargs:
  # Stop generating at the first newline.
  until:
    - "\n"
  do_sample: false
  num_beams: 1
  max_new_tokens: 16
metric_list:
  - metric: em
    aggregation: mean
    higher_is_better: true
  - metric: fscore
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# NorIdiom, Nynorsk ("nno") subset, prompt variant p0.
# Prompt (English gloss): "Complete this expression: <idiom start>".
tag: noridiom_nno
task: noridiom_nno_p0
# Shared dataset, generation and metric settings come from the included file.
include: ../_noridiom_yaml
# Document filter selecting the Nynorsk rows.
process_docs: !function ../utils.filter_dataset_nn
doc_to_text: "Fullfør dette uttrykket: {{idiom_start}}"
# NorIdiom, Nynorsk ("nno") subset, prompt variant p1.
# Prompt (English gloss): "Write the continuation of the idiom <idiom start>".
tag: noridiom_nno
task: noridiom_nno_p1
# Shared dataset, generation and metric settings come from the included file.
include: ../_noridiom_yaml
# Document filter selecting the Nynorsk rows.
process_docs: !function ../utils.filter_dataset_nn
doc_to_text: "Skriv fortsetjinga av idiomet {{idiom_start}}"
# NorIdiom, Nynorsk ("nno") subset, prompt variant p2.
# Prompt (English gloss): 'How does the expression "<idiom start>" continue?'.
tag: noridiom_nno
task: noridiom_nno_p2
# Shared dataset, generation and metric settings come from the included file.
include: ../_noridiom_yaml
# Document filter selecting the Nynorsk rows.
process_docs: !function ../utils.filter_dataset_nn
doc_to_text: "Korleis fortset uttrykket \"{{idiom_start}}\"?"
# NorIdiom, Nynorsk ("nno") subset, prompt variant p3.
# Prompt (English gloss): "Complete the phrase: <idiom start>".
tag: noridiom_nno
task: noridiom_nno_p3
# Shared dataset, generation and metric settings come from the included file.
include: ../_noridiom_yaml
# Document filter selecting the Nynorsk rows.
process_docs: !function ../utils.filter_dataset_nn
doc_to_text: "Fullfør vendinga: {{idiom_start}}"
# NorIdiom, Nynorsk ("nno") subset, prompt variant p4.
# The prompt is the bare idiom start with no instruction text.
tag: noridiom_nno
task: noridiom_nno_p4
# Shared dataset, generation and metric settings come from the included file.
include: ../_noridiom_yaml
# Document filter selecting the Nynorsk rows.
process_docs: !function ../utils.filter_dataset_nn
doc_to_text: "{{idiom_start}}"
# NorIdiom, Bokmål ("nob") subset, prompt variant p0.
# Prompt (English gloss): "Complete this expression: <idiom start>".
tag: noridiom_nob
task: noridiom_nob_p0
# Shared dataset, generation and metric settings come from the included file.
include: ../_noridiom_yaml
# Document filter selecting the Bokmål rows.
process_docs: !function ../utils.filter_dataset_nb
doc_to_text: "Fullfør dette uttrykket: {{idiom_start}}"
# NorIdiom, Bokmål ("nob") subset, prompt variant p1.
# Prompt (English gloss): "Write the continuation of the idiom <idiom start>".
tag: noridiom_nob
task: noridiom_nob_p1
# Shared dataset, generation and metric settings come from the included file.
include: ../_noridiom_yaml
# Document filter selecting the Bokmål rows.
process_docs: !function ../utils.filter_dataset_nb
doc_to_text: "Skriv fortsettelsen av idiomet {{idiom_start}}"
# NorIdiom, Bokmål ("nob") subset, prompt variant p2.
# Prompt (English gloss): 'How does the expression "<idiom start>" continue?'.
tag: noridiom_nob
task: noridiom_nob_p2
# Shared dataset, generation and metric settings come from the included file.
include: ../_noridiom_yaml
# Document filter selecting the Bokmål rows.
process_docs: !function ../utils.filter_dataset_nb
doc_to_text: "Hvordan fortsetter uttrykket \"{{idiom_start}}\"?"
# NorIdiom, Bokmål ("nob") subset, prompt variant p3.
# Prompt (English gloss): 'Complete the phrase "<idiom start>"'.
# NOTE(review): unlike the Nynorsk p3 prompt, this one quotes the idiom and
# has no colon — confirm the difference is intentional.
tag: noridiom_nob
task: noridiom_nob_p3
# Shared dataset, generation and metric settings come from the included file.
include: ../_noridiom_yaml
# Document filter selecting the Bokmål rows.
process_docs: !function ../utils.filter_dataset_nb
doc_to_text: "Fullfør vendingen \"{{idiom_start}}\""
# NorIdiom, Bokmål ("nob") subset, prompt variant p4.
# The prompt is the bare idiom start with no instruction text.
tag: noridiom_nob
task: noridiom_nob_p4
# Shared dataset, generation and metric settings come from the included file.
include: ../_noridiom_yaml
# Document filter selecting the Bokmål rows.
process_docs: !function ../utils.filter_dataset_nb
doc_to_text: "{{idiom_start}}"
from collections import Counter
from string import punctuation
import numpy as np
def normalize(text):
    """
    Canonicalize a string for comparison.

    Removes every character listed in ``string.punctuation``, lowercases the
    remainder, and trims leading/trailing whitespace (in that order).
    """
    # A translation table that maps each punctuation character to "deleted".
    drop_punctuation = str.maketrans("", "", punctuation)
    stripped_of_punctuation = text.translate(drop_punctuation)
    return stripped_of_punctuation.lower().strip()
def f1(prediction, completion):
    """
    Token-level F1 between a prediction and one reference completion.

    Both strings are split on whitespace and compared as bags of tokens.

    Args:
        prediction: The model output.
        completion: The reference completion.

    Returns:
        ``1`` or ``0`` (int) when either side has no tokens, depending on
        whether both are empty; ``0`` when no tokens overlap; otherwise the
        harmonic mean of token precision and recall as a float.
    """
    reference_tokens = completion.split()
    predicted_tokens = prediction.split()

    # With an empty side, F1 is undefined: score 1 only if both are empty.
    if not reference_tokens or not predicted_tokens:
        return int(reference_tokens == predicted_tokens)

    # Multiset intersection counts each shared token at most min(count) times.
    shared = Counter(reference_tokens) & Counter(predicted_tokens)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0

    precision = overlap / len(predicted_tokens)
    recall = overlap / len(reference_tokens)
    return (2 * precision * recall) / (precision + recall)
def process_results(doc, results):
    """
    Score one generation against every accepted completion of a document.

    Args:
        doc: A mapping with an ``"accepted_completions"`` entry holding the
            reference strings.
        results: A sequence whose first element is the generated text.

    Returns:
        A dict with ``"em"`` (best exact match over the references) and
        ``"fscore"`` (best token-level F1 over the references), both computed
        on normalized text.
    """
    predicted = normalize(results[0])
    references = [normalize(reference) for reference in doc["accepted_completions"]]

    # Score against each reference separately; the best one counts.
    em_per_reference = [int(predicted == reference) for reference in references]
    f1_per_reference = [
        f1(prediction=predicted, completion=reference) for reference in references
    ]

    return {
        "em": np.nanmax(em_per_reference),
        "fscore": np.nanmax(f1_per_reference),
    }
def filter_dataset_nb(dataset):
    """Return the result of filtering ``dataset`` to rows whose ``"language"`` is ``"nob"``."""

    def _is_nob(example):
        return example["language"] == "nob"

    return dataset.filter(_is_nob)
def filter_dataset_nn(dataset):
    """Return the result of filtering ``dataset`` to rows whose ``"language"`` is ``"nno"``."""

    def _is_nno(example):
        return example["language"] == "nno"

    return dataset.filter(_is_nno)
# Shared base configuration for the NorOpenBookQA multiple-choice tasks.
# Per-prompt task files include this file and add their own tag, task name,
# dataset_name, prompt template and answer choices.
dataset_path: ltg/noropenbookqa
output_type: multiple_choice
training_split: train
validation_split: null
test_split: test
process_docs: !function utils.filter_dataset
# The target is the position of the answer label within the list of labels.
doc_to_target: "{{choices.label.index(answer)}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# NorOpenBookQA, "nn" dataset configuration, prompt variant p0.
# Prompt: the fact on one line, the question stem on the next.
tag: noropenbookqa_nno
task: noropenbookqa_nno_p0
dataset_name: nn
# Shared dataset, target and metric settings come from the included file.
include: ../_noropenbookqa_yaml
doc_to_text: "{{fact}}\n{{question_stem}}"
# The answer choices are the option texts.
doc_to_choice: "{{choices.text}}"
# NorOpenBookQA, "nn" dataset configuration, prompt variant p1.
# Prompt: labelled fact and question, a dash-bulleted list of the four option
# texts, then "Kva er rett svar?" ("What is the correct answer?").
# NOTE(review): "Svaralternativer" looks like the Bokmål plural; the Nynorsk
# form would presumably be "Svaralternativ" — confirm with a native speaker.
tag: noropenbookqa_nno
task: noropenbookqa_nno_p1
dataset_name: nn
# Shared dataset, target and metric settings come from the included file.
include: ../_noropenbookqa_yaml
doc_to_text: "Faktatekst: {{fact}}\nSpørsmål til teksten: {{question_stem}}\n\nSvaralternativer:\n- {{choices.text[0]}}\n- {{choices.text[1]}}\n- {{choices.text[2]}}\n- {{choices.text[3]}}\n\nKva er rett svar?"
# The answer choices are the option texts.
doc_to_choice: "{{choices.text}}"
# NorOpenBookQA, "nn" dataset configuration, prompt variant p2.
# Prompt: fact, question, the four options prefixed A–D, then a question
# asking whether the correct answer is A, B, C or D, ending with "Svar:".
tag: noropenbookqa_nno
task: noropenbookqa_nno_p2
dataset_name: nn
# Shared dataset, target and metric settings come from the included file.
include: ../_noropenbookqa_yaml
doc_to_text: "{{fact}}\n{{question_stem}}\nA: {{choices.text[0]}}\nB: {{choices.text[1]}}\nC: {{choices.text[2]}}\nD: {{choices.text[3]}}\n\nEr det rette svaret A, B, C, eller D?\n\nSvar:"
# The answer choices are the option labels rather than the option texts.
doc_to_choice: "{{choices.label}}"
# NorOpenBookQA, "nn" dataset configuration, prompt variant p3.
# Prompt: "Bakgrunn:" (background) with the fact, "Spørsmål:" (question) with
# the stem, the four options prefixed A–D, ending with "Svar:" (answer).
tag: noropenbookqa_nno
task: noropenbookqa_nno_p3
dataset_name: nn
# Shared dataset, target and metric settings come from the included file.
include: ../_noropenbookqa_yaml
doc_to_text: "Bakgrunn: {{fact}}\n\nSpørsmål: {{question_stem}}\nA: {{choices.text[0]}}\nB: {{choices.text[1]}}\nC: {{choices.text[2]}}\nD: {{choices.text[3]}}\n\nSvar:"
# The answer choices are the option labels rather than the option texts.
doc_to_choice: "{{choices.label}}"
# NorOpenBookQA, "nn" dataset configuration, prompt variant p4.
# Prompt: an instruction to base the answer on the given fact, the question,
# an instruction to pick among the listed option texts, ending with "Svar:".
tag: noropenbookqa_nno
task: noropenbookqa_nno_p4
dataset_name: nn
# Shared dataset, target and metric settings come from the included file.
include: ../_noropenbookqa_yaml
doc_to_text: "Ta utgangspunkt i følgande fakta når du svarar spørsmålet: {{fact}}\n\n{{question_stem}}\nVel rett svar blant desse alternativa:\n {{choices.text[0]}}\n {{choices.text[1]}}\n {{choices.text[2]}}\n {{choices.text[3]}}\n\nSvar:"
# The answer choices are the option texts.
doc_to_choice: "{{choices.text}}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment