Commit bf11ac93 authored by Baber

Merge branch 'main' into llama

parents 83b1c564 ade01428
task: arc_eu_easy
dataset_path: HiTZ/ARC-eu
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
doc_to_text: "Galdera: {{question}}\nErantzuna:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Galdera: {{question}}\nErantzuna:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
group: basque_bench
task:
- arc_eu_challenge
- arc_eu_easy
- belebele_eus_Latn
- xstorycloze_eu
- flores_eu
- xcopa_eu
- mgsm_direct_eu
- mgsm_native_cot_eu
- paws_eu
- piqa_eu
metadata:
version: 1.0
task: paws_eu
dataset_path: HiTZ/PAWS-eu
dataset_name: null
output_type: multiple_choice
test_split: test
process_docs: !function utils.paws_process_docs
doc_to_text: ''
doc_to_target: label
doc_to_choice: '{{[sentence1+", ezta? Ez, "+sentence2, sentence1+", ezta? Bai, "+sentence2]}}'
target_delimiter: ''
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
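The `doc_to_choice` template above turns each processed sentence pair into two candidate continuations ("…, ezta? Ez, …" / "…, ezta? Bai, …"). A minimal sketch of how it renders, assuming `jinja2` is available; the Basque field values are invented for the example:

```python
# Illustration only: render the paws_eu doc_to_choice template for one
# processed document (field values are made up).
from jinja2 import Template

choice_template = Template(
    '{{ [sentence1 + ", ezta? Ez, " + sentence2, '
    'sentence1 + ", ezta? Bai, " + sentence2] }}'
)

doc = {
    "sentence1": "Filma 2008an estreinatu zen",      # trailing period already stripped
    "sentence2": "filma urte berean kaleratu zen.",  # first letter already lowercased
}
print(choice_template.render(**doc))
# ['Filma 2008an estreinatu zen, ezta? Ez, filma urte berean kaleratu zen.',
#  'Filma 2008an estreinatu zen, ezta? Bai, filma urte berean kaleratu zen.']
```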
from functools import partial
# ~~~~~~~~~~~ XCOPA ~~~~~~~~~~~ #
xcopa_connectors = {"cause": " Izan ere,", "effect": " Beraz,"}
def xcopa_doc_to_choice(doc):
return [convert_choice(doc["choice1"]), convert_choice(doc["choice2"])]
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# ~~~~~~~~~~~ PAWS-X ~~~~~~~~~~~ #
def paws_process_docs(dataset):
empty_docs = []
def _process_doc(doc):
if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]:
# Remove final punctuation mark in the first sentence
if doc["sentence1"].endswith((".", ",", ";")):
doc["sentence1"] = doc["sentence1"][:-1]
# Start the second sentence in lowercase (to be used after "Yes, ...")
doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
return doc
else:
empty_docs.append(doc)
return doc
def lowercase_first_letter(text):
return text[0].lower() + text[1:]
return dataset.filter(
lambda doc: doc["sentence1"] not in [None, ""]
and doc["sentence2"] not in [None, ""]
).map(_process_doc)
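A minimal standalone usage sketch for the function above, assuming the `HiTZ/PAWS-eu` test split exposes `sentence1`, `sentence2`, and `label` columns; in normal runs the harness calls it through `process_docs: !function utils.paws_process_docs`:

```python
# Hypothetical sanity check, mirroring what the harness does via process_docs.
from datasets import load_dataset

raw = load_dataset("HiTZ/PAWS-eu", split="test")
processed = paws_process_docs(raw)

example = processed[0]
print(example["sentence1"])  # trailing ./,/; removed
print(example["sentence2"])  # first letter lowercased
print(example["label"])      # PAWS convention: 1 = paraphrase, 0 = not
```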
aggregate_metric_list:
- metric: exact_match
aggregation: mean
weight_by_size: true
- metric: math_verify
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
Homepage: https://github.com/google/BIG-bench
* `group_name`: `Short description`
#### Tags
* `bigbench_generate_until`
* `bigbench_multiple_choice_a`
* `bigbench_multiple_choice_b`
#### Tasks
* `task_name`: `1-sentence description of what this particular task does`
group: bigbench_generate_until
tag: bigbench_generate_until
dataset_path: hails/bigbench
output_type: generate_until
dataset_kwargs:
group: bigbench_multiple_choice
tag: bigbench_multiple_choice_a
dataset_path: hails/bigbench
dataset_kwargs:
# num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
group: bigbench_multiple_choice
tag: bigbench_multiple_choice_b
dataset_path: hails/bigbench
dataset_kwargs:
# num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
should_decontaminate: true
doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
### Citation
```bibtex
@inproceedings{baucells-etal-2025-iberobench,
    title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
    author = "Baucells, Irene  and
      Aula-Blasco, Javier  and
      de-Dios-Flores, Iria  and
      Paniagua Su{\'a}rez, Silvia  and
      Perez, Naiara  and
      Salles, Anna  and
      Sotelo Docio, Susana  and
      Falc{\~a}o, J{\'u}lia  and
      Saiz, Jose Javier  and
      Sepulveda Torres, Robiert  and
      Barnes, Jeremy  and
      Gamallo, Pablo  and
      Gonzalez-Agirre, Aitor  and
      Rigau, German  and
      Villegas, Marta",
    editor = "Rambow, Owen  and
      Wanner, Leo  and
      Apidianaki, Marianna  and
      Al-Khalifa, Hend  and
      Eugenio, Barbara Di  and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.699/",
    pages = "10491--10519",
}
```
### Groups and Tasks
......
# Evalita-LLM
### Paper
Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. Its distinguishing and innovative features are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) in addition to well-established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, mitigating model sensitivity to specific prompts and allowing a fairer, more objective evaluation.
### Citation
```bibtex
@misc{magnini2025evalitallmbenchmarkinglargelanguage,
title={Evalita-LLM: Benchmarking Large Language Models on Italian},
author={Bernardo Magnini and Roberto Zanoli and Michele Resta and Martin Cimmino and Paolo Albano and Marco Madeddu and Viviana Patti},
year={2025},
eprint={2502.02289},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2502.02289},
}
```
### Groups
- `evalita-mp`: All tasks (perplexity and non-perplexity based).
- `evalita-mp_gen`: Only generative tasks.
- `evalita-mp_mc`: Only perplexity-based tasks.
#### Tasks
The following Evalita-LLM tasks can also be evaluated in isolation:
- `evalita-mp_te`: Textual Entailment
- `evalita-mp_sa`: Sentiment Analysis
- `evalita-mp_wic`: Word in Context
- `evalita-mp_hs`: Hate Speech Detection
- `evalita-mp_at`: Admission Tests
- `evalita-mp_faq`: FAQ
- `evalita-mp_sum_fp`: Summarization
- `evalita-mp_ls`: Lexical Substitution
- `evalita-mp_ner_group`: Named Entity Recognition
- `evalita-mp_re`: Relation Extraction
### Usage
```bash
lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size auto
```
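The subtasks listed above can also be selected individually. As a hedged sketch, the same evaluation can be driven from Python through the harness's `simple_evaluate` entry point instead of the CLI (exact keyword arguments may differ across harness versions):

```python
# Hedged sketch: evaluate a single Evalita-LLM subtask in isolation
# via the Python API rather than the CLI command shown above.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Llama-2-7b-hf",
    tasks=["evalita-mp_sa"],  # any task from the list above
    batch_size="auto",
    device="cuda:0",
)
print(results["results"])
```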
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: evalitahf/admission_test
output_type: multiple_choice
test_split: test
fewshot_split: dev
validation_split: test
doc_to_target: Correct
doc_to_choice: ["A", "B", "C", "D", "E"]
metadata:
version: 1
group: evalita-mp
group_alias: Evalita-LLM
task:
- evalita-mp_te
- evalita-mp_sa
- evalita-mp_wic
- evalita-mp_hs
- evalita-mp_at
- evalita-mp_faq
- evalita-mp_sum_fp
- evalita-mp_ls
- evalita-mp_ner_group
- evalita-mp_re
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 1
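With `weight_by_size: True`, the group accuracy is a subtask-size-weighted mean rather than a plain average of subtask scores. A hedged numerical sketch (document counts and accuracies are invented):

```python
# Illustration only: how a size-weighted mean differs from a plain mean.
sizes = {"evalita-mp_te": 400, "evalita-mp_sa": 1200}  # hypothetical doc counts
accs = {"evalita-mp_te": 0.71, "evalita-mp_sa": 0.64}  # hypothetical accuracies

plain_mean = sum(accs.values()) / len(accs)                              # 0.675
weighted = sum(sizes[t] * accs[t] for t in accs) / sum(sizes.values())   # 0.6575
print(plain_mean, weighted)
```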
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-1
task_alias: prompt-1
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
#doc_to_text: "Dato il seguente caso clinico: '{{background}}' qual è la risposta corretta alla domanda: '{{domanda}}'?"
doc_to_text: "Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?"
doc_to_choice: "{{[A,B,C,D,E]}}"
doc_to_target: "{{ A if Correct == 'A' else B if Correct == 'B' else C if Correct == 'C' else D if Correct == 'D' else E}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1
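The chained `doc_to_target` expression above resolves to the text of the option named by `Correct`, so each choice is compared against the answer string rather than the letter. A plain-Python equivalent, with invented field values:

```python
# Hedged equivalent of the Jinja doc_to_target chain above (fields invented).
doc = {"A": "aorta", "B": "vena cava", "C": "arteria polmonare",
       "D": "vena porta", "E": "arteria renale", "Correct": "C"}
target = doc[doc["Correct"]]  # same result as the chained if/else template
print(target)                 # arteria polmonare
```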
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-2
task_alias: prompt-2
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
#doc_to_text: "Devi risolvere un compito di risposte a domande. Dato il seguente caso clinico: '{{background}}' qual è la risposta corretta alla domanda: '{{domanda}}'?"
doc_to_text: "Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?"
doc_to_choice: "{{[A,B,C,D,E]}}"
doc_to_target: "{{ A if Correct == 'A' else B if Correct == 'B' else C if Correct == 'C' else D if Correct == 'D' else E}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-3
task_alias: prompt-3
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
#doc_to_text: "Dato il seguente caso clinico: '{{background}}', qual è la risposta corretta alla domanda: '{{domanda}}'?\nA: {{A}}\nB: {{B}}\nC: {{C}}\nD: {{D}}\nE: {{E}}\nRisposta:"
doc_to_text: "Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\nA: {{A}}\nB: {{B}}\nC: {{C}}\nD: {{D}}\nE: {{E}}\nRisposta:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-4
task_alias: prompt-4
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
doc_to_text: "Devi risolvere un compito a scelta multipla. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\nA: {{A}}\nB: {{B}}\nC: {{C}}\nD: {{D}}\nE: {{E}}\nRisposta:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-5
task_alias: prompt-5
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
#doc_to_text: "Dato il seguente caso clinico: '{{background}}'. La risposta corretta alla domanda: '{{domanda}}' è:"
doc_to_text: "Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è:"
doc_to_choice: "{{[A,B,C,D,E]}}"
doc_to_target: "{{ A if Correct == 'A' else B if Correct == 'B' else C if Correct == 'C' else D if Correct == 'D' else E}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-6
task_alias: prompt-6
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
#doc_to_text: "Devi risolvere un compito di risposte a domande. Dato il seguente caso clinico: '{{background}}'. La risposta corretta alla domanda: '{{domanda}}' è:"
doc_to_text: "Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è:"
doc_to_choice: "{{[A,B,C,D,E]}}"
doc_to_target: "{{ A if Correct == 'A' else B if Correct == 'B' else C if Correct == 'C' else D if Correct == 'D' else E}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1