Unverified commit 0e763862, authored by zxcvuser and committed by GitHub

Add new benchmark: Galician bench (#2155)

* Add galician_bench

* Update xnli_gl path

* Add flores_gl group

* Update _flores_common_yaml

* Updated some task groupings and readme

---------
parent ea17b98e
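The configs and helper code below define the new tasks; once merged, the group can be run through the harness like any other benchmark. A minimal sketch, assuming the `lm_eval.simple_evaluate` Python entry point and a placeholder Hugging Face model id (not a real checkpoint):

import lm_eval

# Placeholder checkpoint; substitute any causal LM usable by the harness.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=your-org/your-galician-model",
    tasks=["galician_bench"],
    batch_size=8,
)
for task_name, metrics in results["results"].items():
    print(task_name, metrics)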
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_it-gl
doc_to_text: 'Italian sentence: {{sentence_ita_Latn}}

  Galician sentence:'
doc_to_target: '{{sentence_glg_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_pt-gl
doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}}

  Galician sentence:'
doc_to_target: '{{sentence_glg_Latn}}'
task: galcola
dataset_path: proxectonos/galcola
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "{{sentence}}\nPregunta: Ten sentido esta frase?\nResposta:"
doc_to_target: label
doc_to_choice: ["non", "si"]
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
  - metric: mcc
  - metric: acc
metadata:
  version: 1.0
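`galcola` is a binary acceptability task, so it reports Matthews correlation (`mcc`, a built-in harness metric) alongside accuracy. The helper below is only an illustration of what that number measures, not code from this commit:

import numpy as np

def matthews_corrcoef(golds, preds):
    # MCC from the 2x2 confusion matrix; 1 is perfect, 0 is chance-level,
    # and it stays informative even when the "si"/"non" labels are imbalanced.
    golds, preds = np.asarray(golds), np.asarray(preds)
    tp = np.sum((golds == 1) & (preds == 1))
    tn = np.sum((golds == 0) & (preds == 0))
    fp = np.sum((golds == 0) & (preds == 1))
    fn = np.sum((golds == 1) & (preds == 0))
    denom = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return 0.0 if denom == 0 else (tp * tn - fp * fn) / denom

# Toy check with 1 = "si" (acceptable) and 0 = "non":
print(matthews_corrcoef([1, 0, 1, 1, 0], [1, 0, 0, 1, 0]))  # ~= 0.67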
group: galician_bench
task:
  - belebele_glg_Latn
  - flores_gl
  - galcola
  - summarization_gl
  - parafrases_gl
  - paws_gl
  - openbookqa_gl
  - mgsm_direct_gl
  - truthfulqa_gl
  - xnli_gl
  - xstorycloze_gl
metadata:
  version: 1.0
task: mgsm_direct_gl
dataset_path: proxectonos/mgsm_gl
doc_to_target: '{{answer_number|string}}'
doc_to_text: '{% if answer != None %}{{question + "\nResposta: "}}{% else %}{{"Pregunta: " + question + "\nResposta: "}}{% endif %}'
output_type: generate_until
training_split: train
test_split: test
target_delimiter: ""
generation_kwargs:
  until:
    - "\n\n"
    - "\n"
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  version: 1.0
# Task configuration directly taken from Eleuther AI's implementation as of March 22, 2024
task: openbookqa_gl
dataset_path: proxectonos/openbookqa_gl
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
doc_to_text: question_stem
doc_to_target: "{{choices.label.index(answerKey.lstrip())}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: question_stem
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: parafrases_gl
dataset_path: proxectonos/parafrases_gl
dataset_name: null
training_split: train
validation_split: validation
test_split: test
output_type: multiple_choice
doc_to_text: ""
doc_to_target: '{{0 if Avaliación == 0 else 1}}'
process_docs: !function utils.process_docs_paraphrases
doc_to_choice: '{{[Frase+", verdadeiro? Non, "+Paráfrase, Frase+", verdadeiro? Si, "+Paráfrase]}}'
target_delimiter: ""
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
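For reference, the `doc_to_choice` template above builds two continuations per document, and the model is scored on whichever it assigns the higher likelihood. A plain-Python sketch of what the template evaluates to, using a made-up document shaped like the output of `utils.process_docs_paraphrases`:

# Hypothetical example document (final punctuation already stripped from "Frase",
# "Paráfrase" already lower-cased by process_docs_paraphrases).
doc = {
    "Frase": "O concello aprobou o orzamento",
    "Paráfrase": "o orzamento foi aprobado polo concello.",
    "Avaliación": 3,
}

# Equivalent of the Jinja doc_to_choice expression: index 0 = "Non", index 1 = "Si".
choices = [
    doc["Frase"] + ", verdadeiro? Non, " + doc["Paráfrase"],
    doc["Frase"] + ", verdadeiro? Si, " + doc["Paráfrase"],
]

# doc_to_target maps Avaliación == 0 to choice 0 and anything else to choice 1.
target = 0 if doc["Avaliación"] == 0 else 1
print(choices[target])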
task: paws_gl
dataset_path: proxectonos/PAWS-gl
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
process_docs: !function utils.process_docs_paws
doc_to_text: ''
doc_to_target: label
doc_to_choice: '{{[sentence1+", verdadeiro? Non, "+sentence2, sentence1+", verdadeiro? Si, "+sentence2]}}'
target_delimiter: ''
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: summarization_gl
dataset_path: proxectonos/summarization_gl
output_type: generate_until
test_split: test
training_split: train
validation_split: validation
fewshot_split: train
process_docs: !function utils.process_summarization
doc_to_text: 'Texto: {{text}}

  Resumo:'
doc_to_target: '{{summary}}'
metric_list:
  - metric: bleu
    aggregation: bleu
    higher_is_better: true
  - metric: !function utils.rouge1
    aggregation: !function utils.rouge1_agg
    higher_is_better: true
metadata:
  version: 1.0
tag: truthfulqa_gl
task: truthfulqa_gl_gen
dataset_path: proxectonos/truthfulqa_gl
dataset_name: generation
output_type: generate_until
training_split: null
validation_split: validation
test_split: null
num_fewshot: 0
doc_to_text: "\
  {% set prompt_qa = '\
  Q: Cal é a esperanza de vida nos Estados Unidos?\n\
  A: A esperanza de vida nos Estados Unidos é de 78 anos.\n\n\
  Q: Quen foi o presidente dos Estados Unidos en 1955?\n\
  A: Dwight D. Eisenhower foi o presidente dos Estados Unidos en 1955.\n\n\
  Q: A que partido político perteneceu?\n\
  A: Perteneceu ao Partido Republicano.\n\n\
  Q: Cal é a raíz cadrada de banana?\n\
  A: Sen comentarios.\n\n\
  Q: Como funciona un telescopio?\n\
  A: Os telescopios usan lentes ou espellos para enfocar a luz e facer que os obxectos parezan próximos.\n\n\
  Q: Onde se celebraron os xogos olímpicos de 1992?\n\
  A: Os xogos olímpicos celebráronse en Barcelona, España.\
  '%}\
  {{prompt_qa + '\n\nQ: ' + question}}"
doc_to_target: " "
process_docs: !function utils.process_docs_gen
process_results: !function utils.process_results_gen
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
  - metric: bleu_max
    aggregation: mean
    higher_is_better: true
  - metric: bleu_acc
    aggregation: mean
    higher_is_better: true
  - metric: bleu_diff
    aggregation: mean
    higher_is_better: true
  - metric: rouge1_max
    aggregation: mean
    higher_is_better: true
  - metric: rouge1_acc
    aggregation: mean
    higher_is_better: true
  - metric: rouge1_diff
    aggregation: mean
    higher_is_better: true
  - metric: rouge2_max
    aggregation: mean
    higher_is_better: true
  - metric: rouge2_acc
    aggregation: mean
    higher_is_better: true
  - metric: rouge2_diff
    aggregation: mean
    higher_is_better: true
  - metric: rougeL_max
    aggregation: mean
    higher_is_better: true
  - metric: rougeL_acc
    aggregation: mean
    higher_is_better: true
  - metric: rougeL_diff
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
tag: truthfulqa_gl
task: truthfulqa_gl_mc1
dataset_path: proxectonos/truthfulqa_gl
dataset_name: multiple_choice
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: null
num_fewshot: 0
doc_to_text: "\
  {% set prompt_qa = '\
  Q: Cal é a esperanza de vida nos Estados Unidos?\n\
  A: A esperanza de vida nos Estados Unidos é de 78 anos.\n\n\
  Q: Quen foi o presidente dos Estados Unidos en 1955?\n\
  A: Dwight D. Eisenhower foi o presidente dos Estados Unidos en 1955.\n\n\
  Q: A que partido político perteneceu?\n\
  A: Perteneceu ao Partido Republicano.\n\n\
  Q: Cal é a raíz cadrada de banana?\n\
  A: Sen comentarios.\n\n\
  Q: Como funciona un telescopio?\n\
  A: Os telescopios usan lentes ou espellos para enfocar a luz e facer que os obxectos parezan próximos.\n\n\
  Q: Onde se celebraron os xogos olímpicos de 1992?\n\
  A: Os xogos olímpicos celebráronse en Barcelona, España.\
  '%}\
  {{prompt_qa + '\n\nQ: ' + question + '\nA:'}}"
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
tag: truthfulqa_gl
include: truthfulqa_gl_mc1.yaml
task: truthfulqa_gl_mc2
doc_to_target: 0
doc_to_choice: "{{mc2_targets.choices}}"
process_results: !function utils.process_results_mc2
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
import re
from itertools import product

import datasets
import evaluate
import numpy as np
import sacrebleu
import transformers.data.metrics.squad_metrics as squad_metrics
from rouge_score import rouge_scorer, scoring

from lm_eval.utils import general_detokenize


def lowercase_first_letter(text):
    return text[0].lower() + text[1:]


def process_summarization(dataset):
    def _process_doc(doc):
        # Collapse runs of spaces into a single space
        doc["text"] = re.sub(r" +", " ", doc["text"])
        doc["summary"] = re.sub(r" +", " ", doc["summary"])
        return doc

    return dataset.map(_process_doc)

def process_docs_paraphrases(dataset):
    empty_docs = []

    def _process_doc(doc):
        if doc["Frase"] not in [None, ""] and doc["Paráfrase"] not in [None, ""]:
            doc["Frase"] = general_detokenize(doc["Frase"]).strip()
            doc["Paráfrase"] = general_detokenize(doc["Paráfrase"]).strip()
            # Remove final punctuation mark in the first sentence
            if doc["Frase"].endswith((".", ",", ";")):
                doc["Frase"] = doc["Frase"][:-1]
            # Start the second sentence in lowercase (to be used after "Yes, ...")
            doc["Paráfrase"] = lowercase_first_letter(doc["Paráfrase"])
            return doc
        else:
            empty_docs.append(doc)
            return doc

    if empty_docs != []:
        len_empty_docs = len(empty_docs)
        print(
            f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}"
        )
    return dataset.filter(
        lambda doc: doc["Frase"] not in [None, ""]
        and doc["Paráfrase"] not in [None, ""]
    ).map(_process_doc)

def process_docs_paws(dataset):
    empty_docs = []

    def _process_doc(doc):
        if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]:
            doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
            doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()
            # Remove final punctuation mark in the first sentence
            if doc["sentence1"].endswith((".", ",", ";")):
                doc["sentence1"] = doc["sentence1"][:-1]
            # Start the second sentence in lowercase (to be used after "Yes, ...")
            doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
            return doc
        else:
            empty_docs.append(doc)
            return doc

    if empty_docs != []:
        len_empty_docs = len(empty_docs)
        print(
            f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}"
        )
    return dataset.filter(
        lambda doc: doc["sentence1"] not in [None, ""]
        and doc["sentence2"] not in [None, ""]
    ).map(_process_doc)

def rouge1(items):
    """
    # passthrough for efficiency
    """
    return items


def rouge1_agg(items):
    """
    Higher is better
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    rouge_scorer = evaluate.load("rouge")
    # import code; code.interact(local=dict(globals(), **locals()))
    return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"]

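# Note (illustrative, not part of the original file): summarization_gl.yaml registers
# `rouge1` as a passthrough metric, so each document contributes one
# (reference, prediction) pair, and `rouge1_agg` computes corpus-level ROUGE-1 over
# all pairs at aggregation time, e.g.:
#
#   pairs = [("o tempo vai mellorar mañá", "o tempo mellora mañá")]
#   rouge1_agg(pairs)  # -> a single float in [0, 1] from evaluate's "rouge" metric
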
def process_results_mc2(doc, results):
    lls, is_greedy = zip(*results)

    # Split on the first `0` as everything before it is true (`1`).
    split_idx = list(doc["mc2_targets"]["labels"]).index(0)

    # Compute the normalized probability mass for the correct answer.
    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
    p_true = p_true / (sum(p_true) + sum(p_false))

    return {"acc": sum(p_true)}

def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:
    return dataset.map(preprocess_function_gen)


def preprocess_function_gen(examples):
    def _format_answers(answers):
        formatted_answers = []
        for answer in answers:
            answer = answer.strip()
            if len(answer):
                # Add a period after all answers.
                if answer[-1] != ".":
                    formatted_answers.append(answer + ".")
                else:
                    formatted_answers.append(answer)
        return formatted_answers

    incorrect_answers = _format_answers(examples["incorrect_answers"])
    correct_answers = _format_answers(examples["correct_answers"])
    if "Non teño ningún comentario." not in correct_answers:
        correct_answers.append("Non teño ningún comentario.")

    return {
        "question": examples["question"].strip(),
        "correct_answers": correct_answers,
        "incorrect_answers": incorrect_answers,
    }

def process_doc_nli(dataset):
    def process_fn(doc):
        # Detokenize (remove extra whitespace)
        doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
        doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()
        # Remove the last punctuation mark in sentence1
        doc["sentence1"] = (
            doc["sentence1"][:-1]
            if doc["sentence1"].endswith((".", ",", "!", "?"))
            else doc["sentence1"]
        )
        # Lowercase the first letter in sentence2
        doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
        # Ensure that sentence2 ends with a period
        doc["sentence2"] = (
            (doc["sentence2"] + ".")
            if not doc["sentence2"].endswith(".")
            else doc["sentence2"]
        )
        # Map label names to integers
        label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2}
        doc["gold_label"] = label_to_int[doc["gold_label"]]
        return doc

    return dataset.map(process_fn)

def process_results_gen(doc, results):
    completion = results[0]
    true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
    all_refs = true_refs + false_refs

    # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.

    # # BLEURT
    # bleurt_scores_true = self.bleurt.compute(
    #     predictions=[completion] * len(true_refs), references=true_refs
    # )["scores"]
    # bleurt_scores_false = self.bleurt.compute(
    #     predictions=[completion] * len(false_refs), references=false_refs
    # )["scores"]
    # bleurt_correct = max(bleurt_scores_true)
    # bleurt_incorrect = max(bleurt_scores_false)
    # bleurt_max = bleurt_correct
    # bleurt_diff = bleurt_correct - bleurt_incorrect
    # bleurt_acc = int(bleurt_correct > bleurt_incorrect)

    # BLEU
    bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]
    bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])
    bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])
    bleu_max = bleu_correct
    bleu_diff = bleu_correct - bleu_incorrect
    bleu_acc = int(bleu_correct > bleu_incorrect)

    # ROUGE-N
    rouge_scores = [rouge([ref], [completion]) for ref in all_refs]

    # ROUGE-1
    rouge1_scores = [score["rouge1"] for score in rouge_scores]
    rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])
    rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])
    rouge1_max = rouge1_correct
    rouge1_diff = rouge1_correct - rouge1_incorrect
    rouge1_acc = int(rouge1_correct > rouge1_incorrect)

    # ROUGE-2
    rouge2_scores = [score["rouge2"] for score in rouge_scores]
    rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])
    rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])
    rouge2_max = rouge2_correct
    rouge2_diff = rouge2_correct - rouge2_incorrect
    rouge2_acc = int(rouge2_correct > rouge2_incorrect)

    # ROUGE-L
    rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
    rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])
    rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])
    rougeL_max = rougeL_correct
    rougeL_diff = rougeL_correct - rougeL_incorrect
    rougeL_acc = int(rougeL_correct > rougeL_incorrect)

    return {
        # "bleurt_max": bleurt_max,
        # "bleurt_acc": bleurt_acc,
        # "bleurt_diff": bleurt_diff,
        "bleu_max": bleu_max,
        "bleu_acc": bleu_acc,
        "bleu_diff": bleu_diff,
        "rouge1_max": rouge1_max,
        "rouge1_acc": rouge1_acc,
        "rouge1_diff": rouge1_diff,
        "rouge2_max": rouge2_max,
        "rouge2_acc": rouge2_acc,
        "rouge2_diff": rouge2_diff,
        "rougeL_max": rougeL_max,
        "rougeL_acc": rougeL_acc,
        "rougeL_diff": rougeL_diff,
    }

def bleu(refs, preds):
    """
    Returns `t5` style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    score = sacrebleu.corpus_bleu(
        preds,
        refs,
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    ).score
    return score

def rouge(refs, preds):
    """
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)

    # Add newlines between sentences to correctly compute `rougeLsum`.
    def _prepare_summary(summary):
        summary = summary.replace(" . ", ".\n")
        return summary

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        ref = _prepare_summary(ref)
        pred = _prepare_summary(pred)
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
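A quick smoke test for the two helpers above, using a made-up prediction/reference pair (requires `sacrebleu` and `rouge_score`, both imported at the top of this module):

refs = ["Os telescopios usan lentes ou espellos para enfocar a luz."]
preds = ["Os telescopios usan espellos para enfocar a luz."]

# bleu() expects a list of reference lists; rouge() takes parallel lists of strings.
print(bleu([refs], preds))   # corpus BLEU on a 0-100 scale
print(rouge(refs, preds))    # {'rouge1': ..., 'rouge2': ..., 'rougeLsum': ...}, F1 * 100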
task: xnli_gl
dataset_path: proxectonos/xnli_gl
dataset_name: null
include: ../xnli/xnli_common_yaml
output_type: multiple_choice
doc_to_choice: '{{[sentence1+", verdadeiro? Si, "+sentence2,sentence1+", verdadeiro? Ademais,
  "+sentence2,sentence1+", verdadeiro? Non, "+sentence2]}}'
doc_to_text: ''
target_delimiter: ''
process_docs: !function utils.process_doc_nli
training_split: null
validation_split: null
test_split: test
doc_to_target: gold_label
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: xstorycloze_gl
dataset_path: proxectonos/xstorycloze_gl
output_type: multiple_choice
training_split: train
validation_split: test
doc_to_text: "{{[InputSentence1, InputSentence2, InputSentence3, InputSentence4]|join(' ')}}"
doc_to_target: "{{AnswerRightEnding-1}}"
doc_to_choice: "{{[RandomFifthSentenceQuiz1, RandomFifthSentenceQuiz2]}}"
should_decontaminate: true
doc_to_decontamination_query: "{{[InputSentence1, InputSentence2, InputSentence3, InputSentence4]|join(' ')}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0