Commit 25869601 authored by Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/hf_vlms.py
parents 56f40c53 c1d8795d
tag: phrases_va
dataset_path: gplsi/CA-VA_alignment_test
output_type: generate_until
training_split: null
validation_split: null
test_split: test
fewshot_split: test
num_fewshot: 5
target_delimiter: ' '
generation_kwargs:
until:
- "\n"
metric_list:
- metric: bleu
aggregation: bleu
higher_is_better: true
- metric: ter
aggregation: ter
higher_is_better: false
- metric: chrf
aggregation: chrf
higher_is_better: true
metadata:
version: 1.0
# File generated by `create-yamls.py`
include: _phrases_va_common.yaml
task: phrases_ca-va
doc_to_text: 'Oració en català: {{ca}}
Oració en valencià:'
doc_to_target: '{{va}}'
# File generated by `create-yamls.py`
include: _phrases_va_common.yaml
task: phrases_va-ca
doc_to_text: 'Oració en valencià: {{va}}
Oració en català:'
doc_to_target: '{{ca}}'
task: piqa_ca
dataset_path: projecte-aina/piqa_ca
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: null
doc_to_text: "Pregunta: {{goal}}\nResposta:"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: siqa_ca
dataset_path: projecte-aina/siqa_ca
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: null
doc_to_text: "Pregunta: {{context}} {{question}}\nResposta:"
target_delimiter: " "
doc_to_choice: "{{[answerA, answerB, answerC]}}"
doc_to_target: "{{ (label|int) - 1 }}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: teca
dataset_path: projecte-aina/teca
dataset_name: null
training_split: train
validation_split: validation
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_doc_nli
doc_to_text: ""
doc_to_target: label
target_delimiter: ""
doc_to_choice: '{{[premise + ", correcte? Sí, " + hypothesis, premise + ", correcte? A més, " + hypothesis, premise + ", correcte? No, " + hypothesis]}}'
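# Note (illustrative): with `doc_to_text: ""` and an empty target delimiter,
# each of the three completed sentences above is scored as a whole, e.g.
# "<premise>, correcte? Sí, <hypothesis>" for the entailment choice.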
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
import re
from itertools import product
import evaluate
import transformers.data.metrics.squad_metrics as squad_metrics
from lm_eval.utils import general_detokenize
def lowercase_first_letter(text):
    # Guard against empty strings to avoid an IndexError.
    return text[0].lower() + text[1:] if text else text
def process_doc_nli(dataset):
def process_fn(doc):
# Detokenize(remove extra whitespaces)
doc["premise"] = general_detokenize(doc["premise"]).strip()
doc["hypothesis"] = general_detokenize(doc["hypothesis"]).strip()
# Remove last punctuation mark in the premise
doc["premise"] = (
doc["premise"][:-1]
if doc["premise"].endswith((".", ",", "!", "?"))
else doc["premise"]
)
# Lowercase the first letter in the hypothesis
doc["hypothesis"] = lowercase_first_letter(doc["hypothesis"])
# Ensure that the hypothesis ends with a dot
doc["hypothesis"] = (
(doc["hypothesis"] + ".")
if not doc["hypothesis"].endswith(".")
else doc["hypothesis"]
)
return doc
return dataset.map(process_fn)
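# Illustrative example for `process_doc_nli` (hypothetical input): a doc such as
# {"premise": "El cel és blau .", "hypothesis": "Avui plou."} is mapped to
# {"premise": "El cel és blau", "hypothesis": "avui plou."}: trailing
# punctuation is dropped from the premise and the hypothesis is lowercased so
# it reads naturally after the ", correcte? Sí / A més / No," templates.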
def process_results_coqcat(doc, results):
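    """
    Score the final turn of a CoQCat dialogue with SQuAD exact match and F1.
    The prediction is compared against the main gold answer plus any distinct
    additional annotator answers; when several golds exist, each one is held
    out in turn, the prediction is scored against the rest (taking the max),
    and the per-round scores are averaged.
    """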
# Get all possible answers and compute the scores
turn_id = len(doc["questions"])
answers = [doc["answers"]["input_text"][turn_id - 1]]
additional_answers_list = doc.get("additional_answers")
if additional_answers_list:
for key, additional_answers in additional_answers_list.items():
if additional_answers["input_text"][turn_id - 1].lower() not in map(
str.lower, answers
):
answers.append(additional_answers["input_text"][turn_id - 1])
gold_list = answers
pred = results[0].strip().split("\n")[0]
f1_sum = 0.0
em_sum = 0.0
if len(gold_list) > 1:
for i in range(len(gold_list)):
gold_answers = gold_list[0:i] + gold_list[i + 1 :]
# predictions compared against (n) golds and take maximum
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers)
else:
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list)
return {
"em": em_sum / max(1, len(gold_list)),
"f1": f1_sum / max(1, len(gold_list)),
}
def process_results_qa(doc, results):
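    # Note: only the first gold answer, doc["answers"][0]["text"], is used as
    # the reference; scoring follows the SQuAD exact-match and F1 definitions.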
preds = results[0]
reference = doc["answers"][0]["text"]
f1_sum = squad_metrics.compute_f1(reference, preds)
exact_match = squad_metrics.compute_exact(reference, preds)
return {"f1": f1_sum, "exact_match": exact_match}
def process_doc_cabreu(dataset):
def process_fn(doc):
# Remove duplicate spaces
doc["content"] = re.sub(r" +", " ", doc["content"])
for summary_type, index in product(
["abstractive", "extractive", "extreme"], ["a1", "a2", "a3"]
):
doc["summaries"][summary_type][index] = re.sub(
r" +", " ", doc["summaries"][summary_type][index]
)
return doc
return dataset.map(process_fn)
def process_docs_paraphrases(dataset):
empty_docs = []
def _process_doc(doc):
if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]:
doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()
# Remove final punctuation mark in the first sentence
if doc["sentence1"].endswith((".", ",", ";")):
doc["sentence1"] = doc["sentence1"][:-1]
# Start the second sentence in lowercase (to be used after "Yes, ...")
doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
return doc
else:
empty_docs.append(doc)
return doc
return dataset.filter(
lambda doc: doc["sentence1"] not in [None, ""]
and doc["sentence2"] not in [None, ""]
).map(_process_doc)
def process_docs_copa_ca(dataset):
def _process_doc(doc):
doc["choice1"] = lowercase_first_letter(doc["choice1"])
doc["choice2"] = lowercase_first_letter(doc["choice2"])
return doc
return dataset.map(_process_doc)
def rouge1(items):
"""
# passthrough for efficiency
"""
return items
def rouge1_agg(items):
"""
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
rouge_scorer = evaluate.load("rouge")
return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"]
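# Illustrative usage (hypothetical strings): `rouge1` passes (reference,
# prediction) pairs through unchanged, and `rouge1_agg` aggregates them, e.g.
# rouge1_agg([("o ceo é azul", "o ceo é azul"), ("chove moito", "vai chover")])
# loads the HF `rouge` metric and returns its corpus-level "rouge1" value.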
task: wnli_ca
dataset_path: projecte-aina/wnli-ca
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: null
doc_to_text: "{{sentence1}}\nPregunta: {{sentence2}} Cert o Fals?\nResposta:"
doc_to_target: label
doc_to_choice: ["Fals", "Cert"]
metric_list:
- metric: acc
metadata:
version: 1.0
task: xnli_ca
dataset_path: projecte-aina/xnli-ca
dataset_name: null
include: ../xnli/xnli_common_yaml
output_type: multiple_choice
doc_to_choice: '{{[premise+", correcte? Sí, "+hypothesis,premise+", correcte? A més,
"+hypothesis,premise+", correcte? No, "+hypothesis]}}'
doc_to_text: ''
target_delimiter: ''
process_docs: !function utils.process_doc_nli
training_split: null
validation_split: validation
doc_to_target: label
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: xquad_ca
dataset_path: projecte-aina/xquad-ca
dataset_name: null
output_type: generate_until
doc_to_text: "Context: {{context}}\n\nPregunta: {{question}}\n\nResposta:"
doc_to_target: '{{answers[0]["text"]}}'
validation_split: null
test_split: test
target_delimiter: ' '
process_results: !function utils.process_results_qa
generation_kwargs:
until:
- "\n"
do_sample: false
temperature: 0.0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
- metric: f1
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: xstorycloze_ca
dataset_path: projecte-aina/xstorycloze_ca
dataset_name: ca
output_type: multiple_choice
training_split: train
validation_split: eval
doc_to_text: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}"
doc_to_target: "{{answer_right_ending-1}}"
doc_to_choice: "{{[sentence_quiz1, sentence_quiz2]}}"
should_decontaminate: true
doc_to_decontamination_query: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
-dataset_path: EleutherAI/csatqa
+dataset_path: HAERAE-HUB/csatqa
 test_split: test
 output_type: multiple_choice
 process_docs: !function utils.process_docs

@@ -26,7 +26,7 @@ Homepage: https://github.com/hitz-zentroa/latxa
 ### Groups and Tasks
-#### Groups
+#### Tags
 * `eus_exams_eu`: The Basque version of the exams.
 * `eus_exams_es`: The Spanish version of the exams.

 include: eus_exams
-group:
+tag:
   - eus_exams_es
 doc_to_text: "Pregunta: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nRespuesta:"

 include: eus_exams
-group:
+tag:
   - eus_exams_eu
 doc_to_text: "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nErantzuna:"
# GalicianBench
### Paper
GalicianBench is a benchmark for evaluating language models on Galician tasks. That is, it evaluates the ability of a language model to understand and generate Galician text. GalicianBench offers a combination of pre-existing open datasets and datasets developed exclusively for this benchmark. All the details of GalicianBench will be published in a paper soon.
The new evaluation datasets included in GalicianBench are:
| Task | Category | Homepage |
|:-------------:|:-----:|:-----:|
| Belebele_gl | Reading Comprehension | https://huggingface.co/datasets/proxectonos/belebele_gl |
| GalCoLA | Linguistic Acceptability | https://huggingface.co/datasets/proxectonos/galcola |
| MGSM_gl | Math | https://huggingface.co/datasets/proxectonos/mgsm_gl |
| Parafrases_gl | Paraphrasing | https://huggingface.co/datasets/proxectonos/parafrases_gl |
| PAWS-gl | Paraphrasing | https://huggingface.co/datasets/proxectonos/PAWS-gl |
| OpenBookQA_gl | Question Answering | https://huggingface.co/datasets/proxectonos/openbookqa_gl |
| Summarization_gl | Summarization | https://huggingface.co/datasets/proxectonos/summarization_gl |
| TruthfulQA_gl | Truthfulness | https://huggingface.co/datasets/proxectonos/truthfulqa_gl |
| xnli_gl | NLI | https://huggingface.co/datasets/proxectonos/xnli_gl |
| xstorycloze_gl | Commonsense Reasoning | https://huggingface.co/datasets/proxectonos/xstorycloze_gl |
The datasets included in GalicianBench that have been made public in previous publications are:
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
| FLORES_gl | Translation | [The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation](https://arxiv.org/abs/2106.03193) | https://huggingface.co/datasets/facebook/flores |
### Citation
Paper for GalicianBench coming soon.
### Groups and Tasks
#### Groups
- `galician_bench`: All tasks included in GalicianBench.
- `flores_gl`: All FLORES translation tasks from or to Galician.
#### Tasks
The following tasks evaluate models on the GalicianBench datasets using various scoring methods; a usage sketch follows the list.
- `belebele_glg_Latn`
- `flores_gl`
- `flores_gl-ca`
- `flores_gl-de`
- `flores_gl-en`
- `flores_gl-es`
- `flores_gl-eu`
- `flores_gl-fr`
- `flores_gl-it`
- `flores_gl-pt`
- `flores_ca-gl`
- `flores_de-gl`
- `flores_en-gl`
- `flores_es-gl`
- `flores_eu-gl`
- `flores_fr-gl`
- `flores_it-gl`
- `flores_pt-gl`
- `galcola`
- `summarization_gl`
- `parafrases_gl`
- `paws_gl`
- `openbookqa_gl`
- `mgsm_direct_gl`
- `truthfulqa_gl`
- `xnli_gl`
- `xstorycloze_gl`
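As a minimal sketch (the model identifier below is a placeholder, not a recommendation), the whole suite can be run through the harness's Python API:

```python
# Minimal sketch: evaluate a Hugging Face model on the GalicianBench group.
# "pretrained=..." is a placeholder; substitute any causal LM checkpoint.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["galician_bench"],
)
print(results["results"])
```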
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation?
* [ ] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: belebele_glg_Latn
include: ../belebele/_default_template_yaml
dataset_path: proxectonos/belebele_gl
fewshot_split: train
test_split: train
metadata:
version: 1.0
dataset_path: facebook/flores
dataset_name: all
output_type: generate_until
#! The test split of flores is not publicly available! (See paper section 6.1)
#! We are using `dev` and `devtest` splits, but they're mapped to train/validation/test in `data/flores/flores.py`.
training_split: dev
validation_split: dev
test_split: devtest
fewshot_split: dev
target_delimiter: ''
generation_kwargs:
until:
- "\n"
metric_list:
- metric: bleu
aggregation: bleu
higher_is_better: true
- metric: ter
aggregation: ter
higher_is_better: false
- metric: chrf
aggregation: chrf
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
# ruff: noqa: E731, E741
"""
Script to generate task YAMLs for the FLORES-200 dataset.
Based on `tasks/translation/utils.py`.
"""
import argparse
import itertools
import yaml
from langcodes import Language
# utils
flatten = lambda l: list(itertools.chain(*l))
# constants
_LANGUAGES = [
"ace_Arab",
"bam_Latn",
"dzo_Tibt",
"hin_Deva",
"khm_Khmr",
"mag_Deva",
"pap_Latn",
"sot_Latn",
"tur_Latn",
"ace_Latn",
"ban_Latn",
"ell_Grek",
"hne_Deva",
"kik_Latn",
"mai_Deva",
"pbt_Arab",
"spa_Latn",
"twi_Latn",
"acm_Arab",
"bel_Cyrl",
"eng_Latn",
"hrv_Latn",
"kin_Latn",
"mal_Mlym",
"pes_Arab",
"srd_Latn",
"tzm_Tfng",
"acq_Arab",
"bem_Latn",
"epo_Latn",
"hun_Latn",
"kir_Cyrl",
"mar_Deva",
"plt_Latn",
"srp_Cyrl",
"uig_Arab",
"aeb_Arab",
"ben_Beng",
"est_Latn",
"hye_Armn",
"kmb_Latn",
"min_Arab",
"pol_Latn",
"ssw_Latn",
"ukr_Cyrl",
"afr_Latn",
"bho_Deva",
"eus_Latn",
"ibo_Latn",
"kmr_Latn",
"min_Latn",
"por_Latn",
"sun_Latn",
"umb_Latn",
"ajp_Arab",
"bjn_Arab",
"ewe_Latn",
"ilo_Latn",
"knc_Arab",
"mkd_Cyrl",
"prs_Arab",
"swe_Latn",
"urd_Arab",
"aka_Latn",
"bjn_Latn",
"fao_Latn",
"ind_Latn",
"knc_Latn",
"mlt_Latn",
"quy_Latn",
"swh_Latn",
"uzn_Latn",
"als_Latn",
"bod_Tibt",
"fij_Latn",
"isl_Latn",
"kon_Latn",
"mni_Beng",
"ron_Latn",
"szl_Latn",
"vec_Latn",
"amh_Ethi",
"bos_Latn",
"fin_Latn",
"ita_Latn",
"kor_Hang",
"mos_Latn",
"run_Latn",
"tam_Taml",
"vie_Latn",
"apc_Arab",
"bug_Latn",
"fon_Latn",
"jav_Latn",
"lao_Laoo",
"mri_Latn",
"rus_Cyrl",
"taq_Latn",
"war_Latn",
"arb_Arab",
"bul_Cyrl",
"fra_Latn",
"jpn_Jpan",
"lij_Latn",
"mya_Mymr",
"sag_Latn",
"taq_Tfng",
"wol_Latn",
"arb_Latn",
"cat_Latn",
"fur_Latn",
"kab_Latn",
"lim_Latn",
"nld_Latn",
"san_Deva",
"tat_Cyrl",
"xho_Latn",
"ars_Arab",
"ceb_Latn",
"fuv_Latn",
"kac_Latn",
"lin_Latn",
"nno_Latn",
"sat_Olck",
"tel_Telu",
"ydd_Hebr",
"ary_Arab",
"ces_Latn",
"gaz_Latn",
"kam_Latn",
"lit_Latn",
"nob_Latn",
"scn_Latn",
"tgk_Cyrl",
"yor_Latn",
"arz_Arab",
"cjk_Latn",
"gla_Latn",
"kan_Knda",
"lmo_Latn",
"npi_Deva",
"shn_Mymr",
"tgl_Latn",
"yue_Hant",
"asm_Beng",
"ckb_Arab",
"gle_Latn",
"kas_Arab",
"ltg_Latn",
"nso_Latn",
"sin_Sinh",
"tha_Thai",
"zho_Hans",
"ast_Latn",
"crh_Latn",
"glg_Latn",
"kas_Deva",
"ltz_Latn",
"nus_Latn",
"slk_Latn",
"tir_Ethi",
"zho_Hant",
"awa_Deva",
"cym_Latn",
"grn_Latn",
"kat_Geor",
"lua_Latn",
"nya_Latn",
"slv_Latn",
"tpi_Latn",
"zsm_Latn",
"ayr_Latn",
"dan_Latn",
"guj_Gujr",
"kaz_Cyrl",
"lug_Latn",
"oci_Latn",
"smo_Latn",
"tsn_Latn",
"zul_Latn",
"azb_Arab",
"deu_Latn",
"hat_Latn",
"kbp_Latn",
"luo_Latn",
"ory_Orya",
"sna_Latn",
"tso_Latn",
"azj_Latn",
"dik_Latn",
"hau_Latn",
"kea_Latn",
"lus_Latn",
"pag_Latn",
"snd_Arab",
"tuk_Latn",
"bak_Cyrl",
"dyu_Latn",
"heb_Hebr",
"khk_Cyrl",
"lvs_Latn",
"pan_Guru",
"som_Latn",
"tum_Latn",
]
LANGUAGE_PAIRS = [
(a, b) for idx, a in enumerate(_LANGUAGES) for b in _LANGUAGES[idx + 1 :]
]
LANGUAGES_OF_INTEREST = [
"cat_Latn",
"spa_Latn",
"eng_Latn",
"glg_Latn",
"eus_Latn",
"ita_Latn",
"deu_Latn",
"por_Latn",
"fra_Latn",
]
MAIN_LANG = "glg_Latn"
LANGUAGE_PAIRS = [
(a, b)
for (a, b) in LANGUAGE_PAIRS
if a in LANGUAGES_OF_INTEREST and b in LANGUAGES_OF_INTEREST and MAIN_LANG in (a, b)
]
# auxiliary functions
code_to_language_name = lambda code: Language.make(
language=Language.get(code)["language"]
).display_name()
code_to_short_name = lambda code: Language.get(code)["language"]
jinja_var = (
lambda s: "{{" + s + "}}"
) # wrapper to avoid having to escape { } in format strings
def doc_to_text(src: str, tgt: str) -> str:
src_name, tgt_name = map(code_to_language_name, [src, tgt])
return f"""\
{src_name} sentence: {jinja_var('sentence_' + src)}
{tgt_name} sentence:"""
def doc_to_target(tgt: str) -> str:
return f"{jinja_var('sentence_' + tgt)}"
# main function
def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
"""
Generate a YAML file for each translation direction.
"""
err = []
for src, tgt in LANGUAGE_PAIRS:
# do both translation directions for each lang pair
for src, tgt in [(src, tgt), (tgt, src)]:
lang_pair_name = f"{code_to_short_name(src)}-{code_to_short_name(tgt)}"
yaml_file_name = f"flores_{lang_pair_name}.yaml"
try:
with open(
f"{output_dir}/{yaml_file_name}",
"w" if overwrite else "x",
encoding="utf-8",
) as outfile:
print(f"Creating {yaml_file_name}...")
outfile.write("# File generated by `create-yamls.py`\n")
yaml.dump(
{
# "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"],
# "group": "flores_gl",
"include": "_flores_common_yaml",
"task": f"flores_{lang_pair_name}",
"doc_to_text": doc_to_text(src, tgt),
"doc_to_target": doc_to_target(tgt),
},
outfile,
sort_keys=False,
)
except FileExistsError:
err.append(yaml_file_name)
if len(err) > 0:
raise FileExistsError(
"Files were not created because they already exist:"
f" {', '.join(err)}"
"\nUse flag --overwrite to overwrite them."
)
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument(
"--overwrite",
default=False,
action="store_true",
help="Overwrite files if they already exist",
)
parser.add_argument(
"--output-dir", default=".", help="Directory to write yaml files to"
)
args = parser.parse_args()
gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)
if __name__ == "__main__":
main()
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_ca-gl
doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}}
Galician sentence:'
doc_to_target: '{{sentence_glg_Latn}}'