Merge branch 'main' into humaneval

# Conflicts: # lm_eval/api/task.py

Merge branch 'main' into humaneval
# Conflicts: # lm_eval/api/task.py
173b2bc3 · Baber · 74344829 · bb098f13 · 173b2bc3 · 173b2bc3
Commit 173b2bc3 authored Jan 10, 2025 by Baber
20 changed files
--- a/lm_eval/tasks/basque_bench/flores_eu/flores_eu-es.yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/flores_eu-es.yaml
+# File generated by `create-yamls.py`
+# Basque -> Spanish direction: the prompt shows `sentence_eus_Latn`, the target is `sentence_spa_Latn`.
+# YAML folds the blank line inside the quoted `doc_to_text` scalar into a single newline.
+include: _flores_common_yaml
+task: flores_eu-es
+doc_to_text: 'Basque sentence: {{sentence_eus_Latn}}
+
+  Spanish sentence:'
+doc_to_target: '{{sentence_spa_Latn}}'
--- a/lm_eval/tasks/basque_bench/flores_eu/flores_eu-fr.yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/flores_eu-fr.yaml
+# File generated by `create-yamls.py`
+# Basque -> French direction: the prompt shows `sentence_eus_Latn`, the target is `sentence_fra_Latn`.
+# YAML folds the blank line inside the quoted `doc_to_text` scalar into a single newline.
+include: _flores_common_yaml
+task: flores_eu-fr
+doc_to_text: 'Basque sentence: {{sentence_eus_Latn}}
+
+  French sentence:'
+doc_to_target: '{{sentence_fra_Latn}}'
--- a/lm_eval/tasks/basque_bench/flores_eu/flores_eu-gl.yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/flores_eu-gl.yaml
+# File generated by `create-yamls.py`
+# Basque -> Galician direction: the prompt shows `sentence_eus_Latn`, the target is `sentence_glg_Latn`.
+# YAML folds the blank line inside the quoted `doc_to_text` scalar into a single newline.
+include: _flores_common_yaml
+task: flores_eu-gl
+doc_to_text: 'Basque sentence: {{sentence_eus_Latn}}
+
+  Galician sentence:'
+doc_to_target: '{{sentence_glg_Latn}}'
--- a/lm_eval/tasks/basque_bench/flores_eu/flores_eu-it.yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/flores_eu-it.yaml
+# File generated by `create-yamls.py`
+# Basque -> Italian direction: the prompt shows `sentence_eus_Latn`, the target is `sentence_ita_Latn`.
+# YAML folds the blank line inside the quoted `doc_to_text` scalar into a single newline.
+include: _flores_common_yaml
+task: flores_eu-it
+doc_to_text: 'Basque sentence: {{sentence_eus_Latn}}
+
+  Italian sentence:'
+doc_to_target: '{{sentence_ita_Latn}}'
--- a/lm_eval/tasks/basque_bench/flores_eu/flores_eu-pt.yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/flores_eu-pt.yaml
+# File generated by `create-yamls.py`
+# Basque -> Portuguese direction: the prompt shows `sentence_eus_Latn`, the target is `sentence_por_Latn`.
+# YAML folds the blank line inside the quoted `doc_to_text` scalar into a single newline.
+include: _flores_common_yaml
+task: flores_eu-pt
+doc_to_text: 'Basque sentence: {{sentence_eus_Latn}}
+
+  Portuguese sentence:'
+doc_to_target: '{{sentence_por_Latn}}'
--- a/lm_eval/tasks/basque_bench/flores_eu/flores_eu.yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/flores_eu.yaml
+# Group collecting the 16 FLORES translation tasks to and from Basque
+# (es, en, pt, it, fr, ca, gl, de - one task per direction).
+group: flores_eu
+task:
+  - flores_es-eu
+  - flores_eu-es
+  - flores_en-eu
+  - flores_eu-en
+  - flores_eu-pt
+  - flores_pt-eu
+  - flores_eu-it
+  - flores_it-eu
+  - flores_eu-fr
+  - flores_fr-eu
+  - flores_eu-ca
+  - flores_ca-eu
+  - flores_eu-gl
+  - flores_gl-eu
+  - flores_eu-de
+  - flores_de-eu
+# Group score: mean BLEU over the subtasks. `weight_by_size: false` presumably makes
+# every subtask count equally regardless of its number of documents - TODO confirm.
+aggregate_metric_list:
+  - metric: bleu
+    aggregation: mean
+    weight_by_size: false
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/basque_bench/flores_eu/flores_fr-eu.yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/flores_fr-eu.yaml
+# File generated by `create-yamls.py`
+# French -> Basque direction: the prompt shows `sentence_fra_Latn`, the target is `sentence_eus_Latn`.
+# YAML folds the blank line inside the quoted `doc_to_text` scalar into a single newline.
+include: _flores_common_yaml
+task: flores_fr-eu
+doc_to_text: 'French sentence: {{sentence_fra_Latn}}
+
+  Basque sentence:'
+doc_to_target: '{{sentence_eus_Latn}}'
--- a/lm_eval/tasks/basque_bench/flores_eu/flores_gl-eu.yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/flores_gl-eu.yaml
+# File generated by `create-yamls.py`
+# Galician -> Basque direction: the prompt shows `sentence_glg_Latn`, the target is `sentence_eus_Latn`.
+# YAML folds the blank line inside the quoted `doc_to_text` scalar into a single newline.
+include: _flores_common_yaml
+task: flores_gl-eu
+doc_to_text: 'Galician sentence: {{sentence_glg_Latn}}
+
+  Basque sentence:'
+doc_to_target: '{{sentence_eus_Latn}}'
--- a/lm_eval/tasks/basque_bench/flores_eu/flores_it-eu.yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/flores_it-eu.yaml
+# File generated by `create-yamls.py`
+# Italian -> Basque direction: the prompt shows `sentence_ita_Latn`, the target is `sentence_eus_Latn`.
+# YAML folds the blank line inside the quoted `doc_to_text` scalar into a single newline.
+include: _flores_common_yaml
+task: flores_it-eu
+doc_to_text: 'Italian sentence: {{sentence_ita_Latn}}
+
+  Basque sentence:'
+doc_to_target: '{{sentence_eus_Latn}}'
--- a/lm_eval/tasks/basque_bench/flores_eu/flores_pt-eu.yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/flores_pt-eu.yaml
+# File generated by `create-yamls.py`
+# Portuguese -> Basque direction: the prompt shows `sentence_por_Latn`, the target is `sentence_eus_Latn`.
+# YAML folds the blank line inside the quoted `doc_to_text` scalar into a single newline.
+include: _flores_common_yaml
+task: flores_pt-eu
+doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}}
+
+  Basque sentence:'
+doc_to_target: '{{sentence_eus_Latn}}'
--- a/lm_eval/tasks/basque_bench/mgsm_cot_native_eu.yaml
+++ b/lm_eval/tasks/basque_bench/mgsm_cot_native_eu.yaml
+# Basque MGSM with a step-by-step ("urratsez urrats") prompt, scored by exact match
+# on the number extracted from the generation.
+# NOTE(review): the task id is `mgsm_native_cot_eu` while the file is named
+# `mgsm_cot_native_eu.yaml` - confirm the differing word order is intentional.
+task: mgsm_native_cot_eu
+dataset_path: HiTZ/MGSM-eu
+dataset_name: null
+# Target: when `answer` is set, its text from character 27 onwards (presumably this
+# skips a fixed-length prefix - TODO confirm); otherwise `answer_number` as a string.
+doc_to_target: '{% if answer is not none %}{{answer[27:]}}{% else %}{{answer_number|string}}{%endif %}'
+# Prompt: when `answer` is set the question is used as-is, otherwise it is prefixed
+# with "Galdera: "; both variants end with "\nErantzuna urratsez urrats:".
+doc_to_text: '{% if answer is not none %}{{question+"\nErantzuna urratsez urrats:"}}{% else %}{{"Galdera: "+question+"\nErantzuna urratsez urrats:"}}{% endif %}'
+output_type: generate_until
+training_split: train
+test_split: test
+target_delimiter: " "
+generation_kwargs:
+  # NOTE(review): "\n" is listed as a stop sequence, so generation presumably ends at
+  # the first newline (which would also make the "\n\n" entry redundant) - confirm a
+  # single-line step-by-step answer is what is wanted here.
+  until:
+    - "\n\n"
+    - "\n"
+    - "Galdera:"
+    - </s>
+    - <|im_end|>
+  do_sample: false
+  temperature: 0.0
+filter_list:
+  # Captures the number from a phrase of the form "Erantzuna <number> da", where the
+  # number may be preceded and/or followed by a `$` or `%` sign.
+  - name: "get-answer"
+    filter:
+      - function: "regex"
+        regex_pattern: "Erantzuna [$%]? ?(-?[0-9]+([ .,][0-9.,]+)?) ?[$%]? da"
+      - function: "take_first"
+# Exact match is computed ignoring case, punctuation and spaces.
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+    regexes_to_ignore:
+      - " "
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/basque_bench/mgsm_direct_eu.yaml
+++ b/lm_eval/tasks/basque_bench/mgsm_direct_eu.yaml
+# Basque MGSM with a direct-answer prompt ("Erantzuna:"), scored by exact match
+# against `answer_number`.
+task: mgsm_direct_eu
+dataset_path: HiTZ/MGSM-eu
+dataset_name: null
+# Target is always the numeric answer rendered as a string.
+doc_to_target: '{{answer_number|string}}'
+# Prompt: when `answer` is set the question is used as-is, otherwise it is prefixed
+# with "Galdera: "; both variants end with "\nErantzuna:".
+doc_to_text: '{% if answer is not none %}{{question+"\nErantzuna:"}}{% else %}{{"Galdera: "+question+"\nErantzuna:"}}{% endif %}'
+output_type: generate_until
+training_split: train
+test_split: test
+target_delimiter: " "
+generation_kwargs:
+  # Stop sequences: a newline, the start of a new "Galdera:" block, or an
+  # end-of-sequence marker.
+  until:
+    - "\n\n"
+    - "\n"
+    - "Galdera:"
+    - </s>
+    - <|im_end|>
+  do_sample: false
+  temperature: 0.0
+# Two filter pipelines are declared, so the metric is presumably reported once per
+# pipeline name - TODO confirm against the harness documentation.
+filter_list:
+  - name: remove_whitespace
+    filter:
+      - function: remove_whitespace
+      - function: take_first
+  # Pulls a number out of free-form output; `group_select: -1` presumably keeps the
+  # last regex match - TODO confirm.
+  - name: flexible-extract
+    filter:
+    - function: regex
+      group_select: -1
+      regex_pattern: (-?[0-9]+([ .,][0-9.,]+)?)
+    - function: take_first
+# Exact match is computed ignoring case, punctuation and spaces.
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+    regexes_to_ignore:
+      - " "
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/basque_bench/piqa_eu.yaml
+++ b/lm_eval/tasks/basque_bench/piqa_eu.yaml
+# Basque PIQA as a two-way multiple-choice task; only a validation split is configured.
+task: piqa_eu
+dataset_path: HiTZ/PIQA-eu
+dataset_name: null
+output_type: multiple_choice
+training_split: null
+validation_split: validation
+test_split: null
+# Prompt is the goal framed as "Galdera: ... / Erantzuna:"; the candidate
+# continuations are the two solutions, and `label` names the target among them.
+doc_to_text: "Galdera: {{goal}}\nErantzuna:"
+doc_to_target: label
+doc_to_choice: "{{[sol1, sol2]}}"
+# Decontamination is enabled and uses the `goal` field as its query.
+should_decontaminate: true
+doc_to_decontamination_query: goal
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/basque_bench/utils.py
+++ b/lm_eval/tasks/basque_bench/utils.py
+from functools import partial
+
+
+# ~~~~~~~~~~~ XCOPA ~~~~~~~~~~~ #
+
+# Basque connector appended to the premise, keyed by the document's question type.
+xcopa_connectors = {"cause": " Izan ere,", "effect": " Beraz,"}
+
+
+def xcopa_doc_to_text(doc):
+    """Build the XCOPA prompt: the stripped premise followed by its connector.
+
+    `doc` must provide "question" (a key of `xcopa_connectors`) and "premise"
+    (a string); a missing or unknown key raises KeyError.
+    """
+    # The connector is looked up first, matching the original evaluation order.
+    connector = xcopa_connectors[doc["question"]]
+    premise = doc["premise"].strip()
+    return f"{premise}{connector}"
+
+
+def xcopa_doc_to_choice(doc):
+    """Return the two XCOPA alternatives with their first character lower-cased.
+
+    The prompt built by `xcopa_doc_to_text` ends in a connector followed by a
+    comma, so each choice continues that sentence and should not start with a
+    capital letter.
+
+    `doc` must provide the string fields "choice1" and "choice2"; the result is
+    `[choice1, choice2]` in that order. An empty choice is returned unchanged.
+    """
+
+    def convert_choice(choice):
+        # Slice instead of indexing so an empty choice does not raise IndexError.
+        return choice[:1].lower() + choice[1:]
+
+    return [convert_choice(doc["choice1"]), convert_choice(doc["choice2"])]
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
--- a/lm_eval/tasks/basque_bench/wnli_eu.yaml
+++ b/lm_eval/tasks/basque_bench/wnli_eu.yaml
+# Basque WNLI as a two-way multiple-choice task; only a validation split is configured.
+task: wnli_eu
+dataset_path: HiTZ/wnli-eu
+dataset_name: null
+output_type: multiple_choice
+training_split: null
+validation_split: validation
+test_split: null
+# Prompt: `sentence1`, then `sentence2` posed as an "Egia edo Gezurra?" question.
+doc_to_text: "{{sentence1}}\nGaldera: {{sentence2}} Egia edo Gezurra?\nErantzuna:"
+# `label` presumably indexes the choice list below (0 -> "Gezurra", 1 -> "Egia") -
+# TODO confirm against the dataset's label encoding.
+doc_to_target: label
+doc_to_choice: ["Gezurra", "Egia"]
+metric_list:
+  - metric: acc
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/basque_bench/xcopa_eu.yaml
+++ b/lm_eval/tasks/basque_bench/xcopa_eu.yaml
+# Basque XCOPA as a multiple-choice task with validation and test splits.
+task: xcopa_eu
+dataset_path: HiTZ/XCOPA-eu
+dataset_name: null
+output_type: multiple_choice
+training_split: null
+validation_split: validation
+test_split: test
+# Prompt and choices are built by the Python helpers `xcopa_doc_to_text` and
+# `xcopa_doc_to_choice` from the `utils.py` added alongside this file.
+doc_to_text: !function utils.xcopa_doc_to_text
+doc_to_target: label
+doc_to_choice: !function utils.xcopa_doc_to_choice
+metric_list:
+  - metric: acc
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/basqueglue/README.md
+++ b/lm_eval/tasks/basqueglue/README.md
@@ -43,11 +43,15 @@ Homepage: `https://github.com/hitz-zentroa/latxa`
 }
 ```

-### Groups and Tasks
+### Groups, Tags, and Tasks

 #### Groups

-* `basque-glue`: First version of the implementation
+None.
+
+#### Tags
+
+* `basque-glue`: First version of the implementation. Calls all subtasks, but does not average.

 #### Tasks


--- a/lm_eval/tasks/basqueglue/bec.yaml
+++ b/lm_eval/tasks/basqueglue/bec.yaml
-group: basque-glue
+tag: basque-glue
 task: bec2016eu
 dataset_path: orai-nlp/basqueGLUE
 dataset_name: bec

--- a/lm_eval/tasks/basqueglue/bhtc.yaml
+++ b/lm_eval/tasks/basqueglue/bhtc.yaml
-group: basque-glue
+tag: basque-glue
 task: bhtc_v2
 dataset_path: orai-nlp/basqueGLUE
 dataset_name: bhtc

--- a/lm_eval/tasks/basqueglue/coref.yaml
+++ b/lm_eval/tasks/basqueglue/coref.yaml
-group: basque-glue
+tag: basque-glue
 task: epec_koref_bin
 dataset_path: orai-nlp/basqueGLUE
 dataset_name: coref