Commit 173b2bc3 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into humaneval

# Conflicts:
#	lm_eval/api/task.py
parents 74344829 bb098f13
"fewshot_split": "tha_Thai"
"include": "_default_template_yaml"
"task": "belebele_tha_Thai"
"test_split": "tha_Thai"
dataset_name: tha_Thai
fewshot_split: test
include: _default_template_yaml
task: belebele_tha_Thai
test_split: test
"fewshot_split": "tir_Ethi"
"include": "_default_template_yaml"
"task": "belebele_tir_Ethi"
"test_split": "tir_Ethi"
dataset_name: tir_Ethi
fewshot_split: test
include: _default_template_yaml
task: belebele_tir_Ethi
test_split: test
"fewshot_split": "tsn_Latn"
"include": "_default_template_yaml"
"task": "belebele_tsn_Latn"
"test_split": "tsn_Latn"
dataset_name: tsn_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_tsn_Latn
test_split: test
"fewshot_split": "tso_Latn"
"include": "_default_template_yaml"
"task": "belebele_tso_Latn"
"test_split": "tso_Latn"
dataset_name: tso_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_tso_Latn
test_split: test
"fewshot_split": "tur_Latn"
"include": "_default_template_yaml"
"task": "belebele_tur_Latn"
"test_split": "tur_Latn"
dataset_name: tur_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_tur_Latn
test_split: test
"fewshot_split": "ukr_Cyrl"
"include": "_default_template_yaml"
"task": "belebele_ukr_Cyrl"
"test_split": "ukr_Cyrl"
dataset_name: ukr_Cyrl
fewshot_split: test
include: _default_template_yaml
task: belebele_ukr_Cyrl
test_split: test
"fewshot_split": "urd_Arab"
"include": "_default_template_yaml"
"task": "belebele_urd_Arab"
"test_split": "urd_Arab"
dataset_name: urd_Arab
fewshot_split: test
include: _default_template_yaml
task: belebele_urd_Arab
test_split: test
"fewshot_split": "urd_Latn"
"include": "_default_template_yaml"
"task": "belebele_urd_Latn"
"test_split": "urd_Latn"
dataset_name: urd_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_urd_Latn
test_split: test
"fewshot_split": "uzn_Latn"
"include": "_default_template_yaml"
"task": "belebele_uzn_Latn"
"test_split": "uzn_Latn"
dataset_name: uzn_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_uzn_Latn
test_split: test
"fewshot_split": "vie_Latn"
"include": "_default_template_yaml"
"task": "belebele_vie_Latn"
"test_split": "vie_Latn"
dataset_name: vie_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_vie_Latn
test_split: test
"fewshot_split": "war_Latn"
"include": "_default_template_yaml"
"task": "belebele_war_Latn"
"test_split": "war_Latn"
dataset_name: war_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_war_Latn
test_split: test
"fewshot_split": "wol_Latn"
"include": "_default_template_yaml"
"task": "belebele_wol_Latn"
"test_split": "wol_Latn"
dataset_name: wol_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_wol_Latn
test_split: test
"fewshot_split": "xho_Latn"
"include": "_default_template_yaml"
"task": "belebele_xho_Latn"
"test_split": "xho_Latn"
dataset_name: xho_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_xho_Latn
test_split: test
"fewshot_split": "yor_Latn"
"include": "_default_template_yaml"
"task": "belebele_yor_Latn"
"test_split": "yor_Latn"
dataset_name: yor_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_yor_Latn
test_split: test
"fewshot_split": "zho_Hans"
"include": "_default_template_yaml"
"task": "belebele_zho_Hans"
"test_split": "zho_Hans"
dataset_name: zho_Hans
fewshot_split: test
include: _default_template_yaml
task: belebele_zho_Hans
test_split: test
"fewshot_split": "zho_Hant"
"include": "_default_template_yaml"
"task": "belebele_zho_Hant"
"test_split": "zho_Hant"
dataset_name: zho_Hant
fewshot_split: test
include: _default_template_yaml
task: belebele_zho_Hant
test_split: test
"fewshot_split": "zsm_Latn"
"include": "_default_template_yaml"
"task": "belebele_zsm_Latn"
"test_split": "zsm_Latn"
dataset_name: zsm_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_zsm_Latn
test_split: test
"fewshot_split": "zul_Latn"
"include": "_default_template_yaml"
"task": "belebele_zul_Latn"
"test_split": "zul_Latn"
dataset_name: zul_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_zul_Latn
test_split: test
......@@ -4,48 +4,51 @@ task:
# ANLI R1
- group: anli_r1_flan
group_alias: ANLI R1
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r1
- task: anli_r1_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -53,48 +56,51 @@ task:
# ANLI R2
- group: anli_r2_flan
group_alias: ANLI R2
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r2
- task: anli_r2_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -102,48 +108,51 @@ task:
# ANLI R3
- group: anli_r3_flan
group_alias: ANLI R3
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r3
- task: anli_r3_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -151,38 +160,41 @@ task:
# Arc Easy
- group: arc_easy_flan
group_alias: Arc Easy
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: arc_easy
- task: arc_easy_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
......@@ -190,38 +202,41 @@ task:
# Arc Challenge
- group: arc_challenge_flan
group_alias: Arc Challenge
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: arc_challenge
- task: arc_challenge_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
......@@ -229,53 +244,56 @@ task:
# BoolQ
- group: boolq_flan
group_alias: BoolQ
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: boolq
- task: boolq_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-9
task_alias: prompt-9
include: _held_in_template_yaml
doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
......@@ -283,48 +301,51 @@ task:
# RTE
- group: rte_flan
group_alias: RTE
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: rte
- task: rte_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{sentence2}}\"?\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
  - task: rte_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nQ with options: Can we draw the following conclusion?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{sentence2}}"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nThe answer is"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true:\n\n{{sentence1}}\n\nSentence: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{sentence2}}\n\n{{sentence1}}\nOPTIONS:\n- yes\n- no"
......
......@@ -7,3 +7,9 @@ task:
- minerva_math_num_theory
- minerva_math_prealgebra
- minerva_math_precalc
aggregate_metric_list:
- metric: exact_match
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment