Commit 88486e57 authored by lintangsutawika


Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt
parents 5971f2ca ba73d131
group: bbh_fewshot
dataset_path: lukaemon/bbh
output_type: generate_until
test_split: test
......
group: bbh_zeroshot
task:
- bbh_zeroshot_boolean_expressions
- bbh_zeroshot_causal_judgement
- bbh_zeroshot_date_understanding
- bbh_zeroshot_disambiguation_qa
- bbh_zeroshot_dyck_languages
- bbh_zeroshot_formal_fallacies
- bbh_zeroshot_geometric_shapes
- bbh_zeroshot_hyperbaton
- bbh_zeroshot_logical_deduction_five_objects
- bbh_zeroshot_logical_deduction_seven_objects
- bbh_zeroshot_logical_deduction_three_objects
- bbh_zeroshot_movie_recommendation
- bbh_zeroshot_multistep_arithmetic_two
- bbh_zeroshot_navigate
- bbh_zeroshot_object_counting
- bbh_zeroshot_penguins_in_a_table
- bbh_zeroshot_reasoning_about_colored_objects
- bbh_zeroshot_ruin_names
- bbh_zeroshot_salient_translation_error_detection
- bbh_zeroshot_snarks
- bbh_zeroshot_sports_understanding
- bbh_zeroshot_temporal_sequences
- bbh_zeroshot_tracking_shuffled_objects_five_objects
- bbh_zeroshot_tracking_shuffled_objects_seven_objects
- bbh_zeroshot_tracking_shuffled_objects_three_objects
- bbh_zeroshot_web_of_lies
- bbh_zeroshot_word_sorting
aggregate_metric_list:
- metric: exact_match
aggregation: mean
weight_by_size: true
filter_list: flexible-extract
metadata:
version: 2.0
group: bbh_zeroshot
dataset_path: lukaemon/bbh
output_type: generate_until
test_split: test
......
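The `aggregate_metric_list` entries above (and in the groups below) control how subtask scores roll up into a single group score; with `weight_by_size: true`, each subtask contributes in proportion to its document count instead of counting equally. A minimal sketch of that aggregation, assuming per-subtask scores and sizes are already computed (function and variable names here are illustrative, not the harness's internal API):

```python
# Illustrative size-weighted aggregation for aggregate_metric_list with
# weight_by_size: true. Not the harness's internal code.
def aggregate(scores, sizes, weight_by_size=True):
    if weight_by_size:
        # Weight each subtask's score by its number of documents.
        return sum(s * n for s, n in zip(scores, sizes)) / sum(sizes)
    # Unweighted: simple mean over subtasks.
    return sum(scores) / len(scores)

# Two subtasks: 0.8 exact_match on 200 docs, 0.5 on 50 docs.
print(aggregate([0.8, 0.5], [200, 50]))                        # 0.74
print(aggregate([0.8, 0.5], [200, 50], weight_by_size=False))  # 0.65
```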
group: belebele
task:
- belebele_acm_Arab
- belebele_arz_Arab
- belebele_ceb_Latn
- belebele_fin_Latn
- belebele_hin_Deva
- belebele_ita_Latn
- belebele_khm_Khmr
- belebele_lvs_Latn
- belebele_npi_Deva
- belebele_pol_Latn
- belebele_slv_Latn
- belebele_swe_Latn
- belebele_tso_Latn
- belebele_xho_Latn
- belebele_afr_Latn
- belebele_asm_Beng
- belebele_ces_Latn
- belebele_fra_Latn
- belebele_hin_Latn
- belebele_jav_Latn
- belebele_kin_Latn
- belebele_mal_Mlym
- belebele_npi_Latn
- belebele_por_Latn
- belebele_sna_Latn
- belebele_swh_Latn
- belebele_tur_Latn
- belebele_yor_Latn
- belebele_als_Latn
- belebele_azj_Latn
- belebele_ckb_Arab
- belebele_fuv_Latn
- belebele_hrv_Latn
- belebele_jpn_Jpan
- belebele_kir_Cyrl
- belebele_mar_Deva
- belebele_nso_Latn
- belebele_snd_Arab
- belebele_tam_Taml
- belebele_ukr_Cyrl
- belebele_zho_Hans
- belebele_amh_Ethi
- belebele_bam_Latn
- belebele_dan_Latn
- belebele_gaz_Latn
- belebele_hun_Latn
- belebele_kac_Latn
- belebele_kor_Hang
- belebele_mkd_Cyrl
- belebele_nya_Latn
- belebele_ron_Latn
- belebele_som_Latn
- belebele_tel_Telu
- belebele_urd_Arab
- belebele_zho_Hant
- belebele_apc_Arab
- belebele_ben_Beng
- belebele_deu_Latn
- belebele_grn_Latn
- belebele_hye_Armn
- belebele_kan_Knda
- belebele_lao_Laoo
- belebele_mlt_Latn
- belebele_ory_Orya
- belebele_rus_Cyrl
- belebele_sot_Latn
- belebele_tgk_Cyrl
- belebele_urd_Latn
- belebele_zsm_Latn
- belebele_arb_Arab
- belebele_ben_Latn
- belebele_ell_Grek
- belebele_guj_Gujr
- belebele_ibo_Latn
- belebele_kat_Geor
- belebele_lin_Latn
- belebele_mri_Latn
- belebele_pan_Guru
- belebele_shn_Mymr
- belebele_spa_Latn
- belebele_tgl_Latn
- belebele_uzn_Latn
- belebele_zul_Latn
- belebele_arb_Latn
- belebele_bod_Tibt
- belebele_eng_Latn
- belebele_hat_Latn
- belebele_ilo_Latn
- belebele_kaz_Cyrl
- belebele_lit_Latn
- belebele_mya_Mymr
- belebele_pbt_Arab
- belebele_sin_Latn
- belebele_srp_Cyrl
- belebele_tha_Thai
- belebele_vie_Latn
- belebele_ars_Arab
- belebele_bul_Cyrl
- belebele_est_Latn
- belebele_hau_Latn
- belebele_ind_Latn
- belebele_kea_Latn
- belebele_lug_Latn
- belebele_nld_Latn
- belebele_pes_Arab
- belebele_sin_Sinh
- belebele_ssw_Latn
- belebele_tir_Ethi
- belebele_war_Latn
- belebele_ary_Arab
- belebele_cat_Latn
- belebele_eus_Latn
- belebele_heb_Hebr
- belebele_isl_Latn
- belebele_khk_Cyrl
- belebele_luo_Latn
- belebele_nob_Latn
- belebele_plt_Latn
- belebele_slk_Latn
- belebele_sun_Latn
- belebele_tsn_Latn
- belebele_wol_Latn
aggregate_metric_list:
- aggregation: mean
metric: acc
weight_by_size: true
- aggregation: mean
metric: acc_norm
weight_by_size: true
metadata:
version: 0.0
group: belebele
dataset_path: facebook/belebele
fewshot_config:
sampler: first_n
......
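The `fewshot_config` above selects in-context examples with the `first_n` sampler, which simply takes the first k documents of the few-shot split in dataset order. A rough sketch of that behavior (hypothetical helper, not the harness's sampler class):

```python
# Sketch of first_n few-shot sampling: deterministic and order-preserving,
# so every evaluated example sees the same exemplars.
def sample_first_n(fewshot_docs: list, k: int) -> list:
    return fewshot_docs[:k]
```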
"""
Take in a template YAML and generate the configs for all other splits from it.
"""
import argparse
import os
......@@ -64,3 +65,36 @@ if __name__ == "__main__":
allow_unicode=True,
default_style='"',
)
# write group config out
group_yaml_dict = {
"group": f"belebele_{args.task_prefix}"
if args.task_prefix != ""
else "belebele",
"task": [
(
f"belebele_{args.task_prefix}_{lang}"
if args.task_prefix != ""
else f"belebele_{lang}"
)
for lang in languages
if "default" not in lang
],
"aggregate_metric_list": [
{"metric": "acc", "aggregation": "mean", "weight_by_size": False},
{"metric": "acc_norm", "aggregation": "mean", "weight_by_size": False},
],
"metadata": {"version": 0.0},
}
file_save_path = "_" + args.save_prefix_path + f"{args.task_prefix}.yaml"
with open(file_save_path, "w", encoding="utf-8") as group_yaml_file:
yaml.dump(
group_yaml_dict,
group_yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
......@@ -4,50 +4,51 @@ task:
# ANLI R1
- group: anli_r1_flan
group_alias: ANLI R1
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r1_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -55,50 +56,51 @@ task:
# ANLI R2
- group: anli_r2_flan
group_alias: ANLI R2
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r2_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -106,50 +108,51 @@ task:
# ANLI R3
- group: anli_r3_flan
group_alias: ANLI R3
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r3_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -157,40 +160,41 @@ task:
# Arc Easy
- group: arc_easy_flan
group_alias: Arc Easy
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: arc_easy_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
......@@ -198,40 +202,41 @@ task:
# Arc Challenge
- group: arc_challenge_flan
group_alias: Arc Challenge
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: arc_challenge_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
......@@ -239,55 +244,56 @@ task:
# BoolQ
- group: boolq_flan
group_alias: BoolQ
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: boolq_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq_prompt-9
task_alias: prompt-9
include: _held_in_template_yaml
doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
......@@ -295,50 +301,51 @@ task:
# RTE
- group: rte_flan
group_alias: RTE
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: rte_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{sentence2}}\"?\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nQ with options: Can we draw the following conclusion?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{sentence2}}"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nThe answer is"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true:\n\n{{sentence1}}\n\nSentence: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{sentence2}}\n\n{{sentence1}}\nOPTIONS:\n- yes\n- no"
......
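Each `doc_to_text` and `doc_to_target` above is a Jinja2 template rendered against a dataset row; the integer label is mapped to a verbalized answer by indexing a list literal. A standalone sketch of how one RTE prompt renders, using the `jinja2` package directly on a made-up row (the harness's own renderer adds more machinery around this):

```python
from jinja2 import Template

# Hypothetical RTE-style row; label 0 maps to "yes" in the template below.
doc = {
    "sentence1": "The cat sat on the mat.",
    "sentence2": "A cat is on a mat.",
    "label": 0,
}

doc_to_text = Template(
    "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n"
    "{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
)
doc_to_target = Template("{{['yes', 'no'][label]}}")

print(doc_to_text.render(**doc))
print("Target:", doc_to_target.render(**doc))  # Target: yes
```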
......@@ -7,3 +7,9 @@ task:
- minerva_math_num_theory
- minerva_math_prealgebra
- minerva_math_precalc
aggregate_metric_list:
- metric: exact_match
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
......@@ -15,3 +15,7 @@ task:
task_alias: "professional_medicine (mmlu)"
- task: mmlu_college_biology
task_alias: "college_biology (mmlu)"
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: True
# BertaQA
### Paper
Title: BertaQA: How Much Do Language Models Know About Local Culture?
Abstract: https://arxiv.org/abs/2406.07302
Large Language Models (LLMs) exhibit extensive knowledge about the world, but most evaluations have been limited to global or anglocentric subjects. This raises the question of how well these models perform on topics relevant to other cultures, whose presence on the web is not that prominent. To address this gap, we introduce BertaQA, a multiple-choice trivia dataset that is parallel in English and Basque. The dataset consists of a local subset with questions pertinent to the Basque culture, and a global subset with questions of broader interest. We find that state-of-the-art LLMs struggle with local cultural knowledge, even as they excel on global topics. However, we show that continued pre-training in Basque significantly improves the models' performance on Basque culture, even when queried in English. To our knowledge, this is the first solid evidence of knowledge transfer from a low-resource to a high-resource language. Our analysis sheds light on the complex interplay between language and knowledge, and reveals that some prior findings do not fully hold when reassessed on local topics. Our dataset and evaluation code are available under open licenses at https://github.com/juletx/BertaQA.
Homepage: https://github.com/juletx/BertaQA
### Citation
```
@misc{etxaniz2024bertaqa,
title={BertaQA: How Much Do Language Models Know About Local Culture?},
author={Julen Etxaniz and Gorka Azkune and Aitor Soroa and Oier Lopez de Lacalle and Mikel Artetxe},
year={2024},
eprint={2406.07302},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
- `bertaqa`: Group of BertaQA tasks.
#### Tasks
- `bertaqa_eu`: Trivia questions in Basque.
- `bertaqa_en`: Trivia questions in English, human-translated from Basque.
- `bertaqa_en_mt_*`: Trivia questions in English, machine-translated from Basque with different models.
### Checklist
For adding novel benchmarks/datasets to the library:
- [ ] Is the task an existing benchmark in the literature?
- [ ] Have you referenced the original paper that introduced the task?
- [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
- [ ] Is the "Main" variant of this task clearly denoted?
- [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
- [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
tag: bertaqa
dataset_path: HiTZ/BertaQA
dataset_name: null
validation_split: null
test_split: test
fewshot_split: test
output_type: multiple_choice
doc_to_choice: ["A", "B", "C"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
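With `output_type: multiple_choice`, the harness scores each entry of `doc_to_choice` as a continuation of the rendered prompt, and `acc` records whether the most likely choice matches the gold `answer` index. A toy sketch of that selection rule with made-up log-likelihoods (not harness internals):

```python
# Toy multiple_choice scoring: pick the argmax log-likelihood choice.
choices = ["A", "B", "C"]            # doc_to_choice
gold = 1                             # doc_to_target: answer (index into choices)
loglikelihoods = [-4.2, -1.3, -3.8]  # hypothetical per-choice model scores

pred = max(range(len(choices)), key=lambda i: loglikelihoods[i])
acc = 1.0 if pred == gold else 0.0
print(choices[pred], acc)  # B 1.0
```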
task: bertaqa_en
include: _bertaqa_template
dataset_name: en
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
task: bertaqa_en_mt_gemma-7b
include: _bertaqa_template
dataset_name: en_mt_gemma-7b
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
task: bertaqa_en_mt_hitz
include: _bertaqa_template
dataset_name: en_mt_hitz
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
task: bertaqa_en_mt_itzuli
include: _bertaqa_template
dataset_name: en_mt_itzuli
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
task: bertaqa_en_mt_latxa-13b-v1.1
include: _bertaqa_template
dataset_name: en_mt_latxa-13b-v1.1
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
task: bertaqa_en_mt_latxa-13b-v1
include: _bertaqa_template
dataset_name: en_mt_latxa-13b-v1
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
task: bertaqa_en_mt_latxa-70b-v1.1
include: _bertaqa_template
dataset_name: en_mt_latxa-70b-v1.1
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
task: bertaqa_en_mt_latxa-70b-v1
include: _bertaqa_template
dataset_name: en_mt_latxa-70b-v1
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
task: bertaqa_en_mt_latxa-7b-v1.1
include: _bertaqa_template
dataset_name: en_mt_latxa-7b-v1.1
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"