Commit bf11ac93 authored by Baber

Merge branch 'main' into llama

parents 83b1c564 ade01428
task: arc_eu_easy
dataset_path: HiTZ/ARC-eu
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
doc_to_text: "Galdera: {{question}}\nErantzuna:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Galdera: {{question}}\nErantzuna:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
group: basque_bench
task:
- arc_eu_challenge
- arc_eu_easy
- belebele_eus_Latn
- xstorycloze_eu
- flores_eu
- xcopa_eu
- mgsm_direct_eu
- mgsm_native_cot_eu
- paws_eu
- piqa_eu
metadata:
version: 1.0
task: paws_eu
dataset_path: HiTZ/PAWS-eu
dataset_name: null
output_type: multiple_choice
test_split: test
process_docs: !function utils.paws_process_docs
doc_to_text: ''
doc_to_target: label
doc_to_choice: '{{[sentence1+", ezta? Ez, "+sentence2, sentence1+", ezta? Bai, "+sentence2]}}'
target_delimiter: ''
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
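The `doc_to_choice` template above turns each processed sentence pair into two candidate continuations ("…, ezta? Ez, …" / "…, ezta? Bai, …"). A minimal sketch of how it renders, assuming `jinja2` is available; the Basque field values are invented for the example:

```python
# Illustration only: render the paws_eu doc_to_choice template for one
# processed document (field values are made up).
from jinja2 import Template

choice_template = Template(
    '{{ [sentence1 + ", ezta? Ez, " + sentence2, '
    'sentence1 + ", ezta? Bai, " + sentence2] }}'
)

doc = {
    "sentence1": "Filma 2008an estreinatu zen",      # trailing period already stripped
    "sentence2": "filma urte berean kaleratu zen.",  # first letter already lowercased
}
print(choice_template.render(**doc))
# ['Filma 2008an estreinatu zen, ezta? Ez, filma urte berean kaleratu zen.',
#  'Filma 2008an estreinatu zen, ezta? Bai, filma urte berean kaleratu zen.']
```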
from functools import partial
# ~~~~~~~~~~~ XCOPA ~~~~~~~~~~~ #
xcopa_connectors = {"cause": " Izan ere,", "effect": " Beraz,"}
def xcopa_doc_to_choice(doc):
return [convert_choice(doc["choice1"]), convert_choice(doc["choice2"])]
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# ~~~~~~~~~~~ PAWS-X ~~~~~~~~~~~ #
def paws_process_docs(dataset):
empty_docs = []
def _process_doc(doc):
if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]:
# Remove final punctuation mark in the first sentence
if doc["sentence1"].endswith((".", ",", ";")):
doc["sentence1"] = doc["sentence1"][:-1]
# Start the second sentence in lowercase (to be used after "Yes, ...")
doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
return doc
else:
empty_docs.append(doc)
return doc
def lowercase_first_letter(text):
return text[0].lower() + text[1:]
return dataset.filter(
lambda doc: doc["sentence1"] not in [None, ""]
and doc["sentence2"] not in [None, ""]
).map(_process_doc)
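A minimal standalone usage sketch for the function above, assuming the `HiTZ/PAWS-eu` test split exposes `sentence1`, `sentence2`, and `label` columns; in normal runs the harness calls it through `process_docs: !function utils.paws_process_docs`:

```python
# Hypothetical sanity check, mirroring what the harness does via process_docs.
from datasets import load_dataset

raw = load_dataset("HiTZ/PAWS-eu", split="test")
processed = paws_process_docs(raw)

example = processed[0]
print(example["sentence1"])  # trailing ./,/; removed
print(example["sentence2"])  # first letter lowercased
print(example["label"])      # PAWS convention: 1 = paraphrase, 0 = not
```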
aggregate_metric_list:
- metric: exact_match
aggregation: mean
weight_by_size: true
- metric: math_verify
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
Homepage: https://github.com/google/BIG-bench
* `group_name`: `Short description`
#### Tags
* `bigbench_generate_until`
* `bigbench_multiple_choice_a`
* `bigbench_multiple_choice_b`
#### Tasks
* `task_name`: `1-sentence description of what this particular task does`
group: bigbench_generate_until
tag: bigbench_generate_until
dataset_path: hails/bigbench
output_type: generate_until
dataset_kwargs:
group: bigbench_multiple_choice
tag: bigbench_multiple_choice_a
dataset_path: hails/bigbench
dataset_kwargs:
# num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
group: bigbench_multiple_choice
tag: bigbench_multiple_choice_b
dataset_path: hails/bigbench
dataset_kwargs:
# num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
should_decontaminate: true
doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
### Citation
```bibtex
@inproceedings{baucells-etal-2025-iberobench,
    title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
    author = "Baucells, Irene  and
      Aula-Blasco, Javier  and
      de-Dios-Flores, Iria  and
      Paniagua Su{\'a}rez, Silvia  and
      Perez, Naiara  and
      Salles, Anna  and
      Sotelo Docio, Susana  and
      Falc{\~a}o, J{\'u}lia  and
      Saiz, Jose Javier  and
      Sepulveda Torres, Robiert  and
      Barnes, Jeremy  and
      Gamallo, Pablo  and
      Gonzalez-Agirre, Aitor  and
      Rigau, German  and
      Villegas, Marta",
    editor = "Rambow, Owen  and
      Wanner, Leo  and
      Apidianaki, Marianna  and
      Al-Khalifa, Hend  and
      Eugenio, Barbara Di  and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.699/",
    pages = "10491--10519",
}
```
### Groups and Tasks
......
# Evalita-LLM
### Paper
Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. Its distinguishing and innovative features are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) in addition to well-established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, mitigating model sensitivity to specific prompts and allowing a fairer, more objective evaluation.
### Citation
```bibtex
@misc{magnini2025evalitallmbenchmarkinglargelanguage,
title={Evalita-LLM: Benchmarking Large Language Models on Italian},
author={Bernardo Magnini and Roberto Zanoli and Michele Resta and Martin Cimmino and Paolo Albano and Marco Madeddu and Viviana Patti},
year={2025},
eprint={2502.02289},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2502.02289},
}
```
### Groups
- `evalita-mp`: All tasks (perplexity and non-perplexity based).
- `evalita-mp_gen`: Only generative tasks.
- `evalita-mp_mc`: Only perplexity-based tasks.
#### Tasks
The following Evalita-LLM tasks can also be evaluated in isolation:
- `evalita-mp_te`: Textual Entailment
- `evalita-mp_sa`: Sentiment Analysis
- `evalita-mp_wic`: Word in Context
- `evalita-mp_hs`: Hate Speech Detection
- `evalita-mp_at`: Admission Tests
- `evalita-mp_faq`: FAQ
- `evalita-mp_sum_fp`: Summarization
- `evalita-mp_ls`: Lexical Substitution
- `evalita-mp_ner_group`: Named Entity Recognition
- `evalita-mp_re`: Relation Extraction
### Usage
```bash
lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size auto
```
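The subtasks listed above can also be selected individually. As a hedged sketch, the same evaluation can be driven from Python through the harness's `simple_evaluate` entry point instead of the CLI (exact keyword arguments may differ across harness versions):

```python
# Hedged sketch: evaluate a single Evalita-LLM subtask in isolation
# via the Python API rather than the CLI command shown above.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Llama-2-7b-hf",
    tasks=["evalita-mp_sa"],  # any task from the list above
    batch_size="auto",
    device="cuda:0",
)
print(results["results"])
```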
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: evalitahf/admission_test
output_type: multiple_choice
test_split: test
fewshot_split: dev
validation_split: test
doc_to_target: Correct
doc_to_choice: ["A", "B", "C", "D", "E"]
metadata:
version: 1
group: evalita-mp
group_alias: Evalita-LLM
task:
- evalita-mp_te
- evalita-mp_sa
- evalita-mp_wic
- evalita-mp_hs
- evalita-mp_at
- evalita-mp_faq
- evalita-mp_sum_fp
- evalita-mp_ls
- evalita-mp_ner_group
- evalita-mp_re
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 1
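With `weight_by_size: True`, the group accuracy is a subtask-size-weighted mean rather than a plain average of subtask scores. A hedged numerical sketch (document counts and accuracies are invented):

```python
# Illustration only: how a size-weighted mean differs from a plain mean.
sizes = {"evalita-mp_te": 400, "evalita-mp_sa": 1200}  # hypothetical doc counts
accs = {"evalita-mp_te": 0.71, "evalita-mp_sa": 0.64}  # hypothetical accuracies

plain_mean = sum(accs.values()) / len(accs)                              # 0.675
weighted = sum(sizes[t] * accs[t] for t in accs) / sum(sizes.values())   # 0.6575
print(plain_mean, weighted)
```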
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-1
task_alias: prompt-1
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
#doc_to_text: "Dato il seguente caso clinico: '{{background}}' qual è la risposta corretta alla domanda: '{{domanda}}'?"
doc_to_text: "Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?"
doc_to_choice: "{{[A,B,C,D,E]}}"
doc_to_target: "{{ A if Correct == 'A' else B if Correct == 'B' else C if Correct == 'C' else D if Correct == 'D' else E}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1
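The chained `doc_to_target` expression above resolves to the text of the option named by `Correct`, so each choice is compared against the answer string rather than the letter. A plain-Python equivalent, with invented field values:

```python
# Hedged equivalent of the Jinja doc_to_target chain above (fields invented).
doc = {"A": "aorta", "B": "vena cava", "C": "arteria polmonare",
       "D": "vena porta", "E": "arteria renale", "Correct": "C"}
target = doc[doc["Correct"]]  # same result as the chained if/else template
print(target)                 # arteria polmonare
```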
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-2
task_alias: prompt-2
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
#doc_to_text: "Devi risolvere un compito di risposte a domande. Dato il seguente caso clinico: '{{background}}' qual è la risposta corretta alla domanda: '{{domanda}}'?"
doc_to_text: "Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?"
doc_to_choice: "{{[A,B,C,D,E]}}"
doc_to_target: "{{ A if Correct == 'A' else B if Correct == 'B' else C if Correct == 'C' else D if Correct == 'D' else E}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-3
task_alias: prompt-3
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
#doc_to_text: "Dato il seguente caso clinico: '{{background}}', qual è la risposta corretta alla domanda: '{{domanda}}'?\nA: {{A}}\nB: {{B}}\nC: {{C}}\nD: {{D}}\nE: {{E}}\nRisposta:"
doc_to_text: "Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\nA: {{A}}\nB: {{B}}\nC: {{C}}\nD: {{D}}\nE: {{E}}\nRisposta:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-4
task_alias: prompt-4
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
doc_to_text: "Devi risolvere un compito a scelta multipla. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\nA: {{A}}\nB: {{B}}\nC: {{C}}\nD: {{D}}\nE: {{E}}\nRisposta:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-5
task_alias: prompt-5
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
#doc_to_text: "Dato il seguente caso clinico: '{{background}}'. La risposta corretta alla domanda: '{{domanda}}' è:"
doc_to_text: "Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è:"
doc_to_choice: "{{[A,B,C,D,E]}}"
doc_to_target: "{{ A if Correct == 'A' else B if Correct == 'B' else C if Correct == 'C' else D if Correct == 'D' else E}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1
tag: evalita-mp_at_tasks
include: _at_template_yaml
task: evalita-mp_at_prompt-6
task_alias: prompt-6
#doc_to_text: "Rispondi alla domanda a scelta multipla considerando le informazioni del testo seguente.\nTesto: {{background}}\nDomanda: {{domanda}}\nOpzioni: A: {{A}} B: {{B}} C: {{C}} D: {{D}}"
#doc_to_text: "Devi risolvere un compito di risposte a domande. Dato il seguente caso clinico: '{{background}}'. La risposta corretta alla domanda: '{{domanda}}' è:"
doc_to_text: "Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è:"
doc_to_choice: "{{[A,B,C,D,E]}}"
doc_to_target: "{{ A if Correct == 'A' else B if Correct == 'B' else C if Correct == 'C' else D if Correct == 'D' else E}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1