Commit 948f120f authored by Baber's avatar Baber
Browse files

Merge branch 'main' into autobatchtest

# Conflicts:
#	lm_eval/models/huggingface.py
parents a5b1c7a8 bd80a6c0
"fewshot_split": "zho_Hans"
"include": "_default_template_yaml"
"task": "belebele_zho_Hans"
"test_split": "zho_Hans"
dataset_name: zho_Hans
fewshot_split: test
include: _default_template_yaml
task: belebele_zho_Hans
test_split: test
"fewshot_split": "zho_Hant"
"include": "_default_template_yaml"
"task": "belebele_zho_Hant"
"test_split": "zho_Hant"
dataset_name: zho_Hant
fewshot_split: test
include: _default_template_yaml
task: belebele_zho_Hant
test_split: test
"fewshot_split": "zsm_Latn"
"include": "_default_template_yaml"
"task": "belebele_zsm_Latn"
"test_split": "zsm_Latn"
dataset_name: zsm_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_zsm_Latn
test_split: test
"fewshot_split": "zul_Latn"
"include": "_default_template_yaml"
"task": "belebele_zul_Latn"
"test_split": "zul_Latn"
dataset_name: zul_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_zul_Latn
test_split: test
......@@ -4,48 +4,51 @@ task:
# ANLI R1
- group: anli_r1_flan
group_alias: ANLI R1
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r1
- task: anli_r1_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -53,48 +56,51 @@ task:
# ANLI R2
- group: anli_r2_flan
group_alias: ANLI R2
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r2
- task: anli_r2_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -102,48 +108,51 @@ task:
# ANLI R3
- group: anli_r3_flan
group_alias: ANLI R3
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r3
- task: anli_r3_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -151,38 +160,41 @@ task:
# Arc Easy
- group: arc_easy_flan
group_alias: Arc Easy
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: arc_easy
- task: arc_easy_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
......@@ -190,38 +202,41 @@ task:
# Arc Challenge
- group: arc_challenge_flan
group_alias: Arc Challenge
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: arc_challenge
- task: arc_challenge_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
......@@ -229,53 +244,56 @@ task:
# BoolQ
- group: boolq_flan
group_alias: BoolQ
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: boolq
- task: boolq_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-9
task_alias: prompt-9
include: _held_in_template_yaml
doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
......@@ -283,48 +301,51 @@ task:
# RTE
- group: rte_flan
group_alias: RTE
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: rte
- task: rte_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{sentence2}}\"?\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
  - task: rte_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nQ with options: Can we draw the following conclusion?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{sentence2}}"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nThe answer is"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true:\n\n{{sentence1}}\n\nSentence: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{sentence2}}\n\n{{sentence1}}\nOPTIONS:\n- yes\n- no"
......
......@@ -7,3 +7,9 @@ task:
- minerva_math_num_theory
- minerva_math_prealgebra
- minerva_math_precalc
aggregate_metric_list:
- metric: exact_match
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
......@@ -15,3 +15,7 @@ task:
task_alias: "professional_medicine (mmlu)"
- task: mmlu_college_biology
task_alias: "college_biology (mmlu)"
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: True
group: bertaqa
tag: bertaqa
dataset_path: HiTZ/BertaQA
dataset_name: null
validation_split: null
......
group: blimp
task:
- "blimp_adjunct_island"
- "blimp_anaphor_gender_agreement"
- "blimp_anaphor_number_agreement"
- "blimp_animate_subject_passive"
- "blimp_animate_subject_trans"
- "blimp_causative"
- "blimp_complex_NP_island"
- "blimp_coordinate_structure_constraint_complex_left_branch"
- "blimp_coordinate_structure_constraint_object_extraction"
- "blimp_determiner_noun_agreement_1"
- "blimp_determiner_noun_agreement_2"
- "blimp_determiner_noun_agreement_irregular_1"
- "blimp_determiner_noun_agreement_irregular_2"
- "blimp_determiner_noun_agreement_with_adj_2"
- "blimp_determiner_noun_agreement_with_adj_irregular_1"
- "blimp_determiner_noun_agreement_with_adj_irregular_2"
- "blimp_determiner_noun_agreement_with_adjective_1"
- "blimp_distractor_agreement_relational_noun"
- "blimp_distractor_agreement_relative_clause"
- "blimp_drop_argument"
- "blimp_ellipsis_n_bar_1"
- "blimp_ellipsis_n_bar_2"
- "blimp_existential_there_object_raising"
- "blimp_existential_there_quantifiers_1"
- "blimp_existential_there_quantifiers_2"
- "blimp_existential_there_subject_raising"
- "blimp_expletive_it_object_raising"
- "blimp_inchoative"
- "blimp_intransitive"
- "blimp_irregular_past_participle_adjectives"
- "blimp_irregular_past_participle_verbs"
- "blimp_irregular_plural_subject_verb_agreement_1"
- "blimp_irregular_plural_subject_verb_agreement_2"
- "blimp_left_branch_island_echo_question"
- "blimp_left_branch_island_simple_question"
- "blimp_matrix_question_npi_licensor_present"
- "blimp_npi_present_1"
- "blimp_npi_present_2"
- "blimp_only_npi_licensor_present"
- "blimp_only_npi_scope"
- "blimp_passive_1"
- "blimp_passive_2"
- "blimp_principle_A_c_command"
- "blimp_principle_A_case_1"
- "blimp_principle_A_case_2"
- "blimp_principle_A_domain_1"
- "blimp_principle_A_domain_2"
- "blimp_principle_A_domain_3"
- "blimp_principle_A_reconstruction"
- "blimp_regular_plural_subject_verb_agreement_1"
- "blimp_regular_plural_subject_verb_agreement_2"
- "blimp_sentential_negation_npi_licensor_present"
- "blimp_sentential_negation_npi_scope"
- "blimp_sentential_subject_island"
- "blimp_superlative_quantifiers_1"
- "blimp_superlative_quantifiers_2"
- "blimp_tough_vs_raising_1"
- "blimp_tough_vs_raising_2"
- "blimp_transitive"
- "blimp_wh_island"
- "blimp_wh_questions_object_gap"
- "blimp_wh_questions_subject_gap"
- "blimp_wh_questions_subject_gap_long_distance"
- "blimp_wh_vs_that_no_gap"
- "blimp_wh_vs_that_no_gap_long_distance"
- "blimp_wh_vs_that_with_gap"
- "blimp_wh_vs_that_with_gap_long_distance"
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: False
metadata:
version: 2.0
group: blimp
dataset_path: blimp
output_type: multiple_choice
validation_split: train
......
# CatalanBench
### Paper
CatalanBench is a benchmark for evaluating language models in Catalan tasks. This is, it evaluates the ability of a language model to understand and generate Catalan text. CatalanBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of CatalanBench will be published in a paper soon.
The new evaluation datasets included in CatalanBench are:
| Task | Category | Homepage |
|:-------------:|:-----:|:-----:|
| ARC_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/arc_ca |
| MGSM_ca | Math | https://huggingface.co/datasets/projecte-aina/mgsm_ca |
| OpenBookQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/openbookqa_ca |
| Parafraseja | Paraphrasing | https://huggingface.co/datasets/projecte-aina/Parafraseja |
| PIQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/piqa_ca |
| SIQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/siqa_ca |
| XStoryCloze_ca | Commonsense Reasoning | https://huggingface.co/datasets/projecte-aina/xstorycloze_ca |
The datasets included in CatalanBench that have been made public in previous publications are:
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
| Belebele_ca | Reading Comprehension | [The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants](https://arxiv.org/abs/2308.16884) | https://huggingface.co/datasets/facebook/belebele |
| caBREU | Summarization | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/caBreu |
| CatalanQA | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/catalanqa |
| CatCoLA | Linguistic Acceptability | CatCoLA: Catalan Corpus of Linguistic Acceptability | https://huggingface.co/datasets/nbel/CatCoLA |
| COPA-ca | Commonsense Reasoning | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/COPA-ca |
| CoQCat | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/CoQCat |
| FLORES_ca | Translation | [The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation](https://arxiv.org/abs/2106.03193) | https://huggingface.co/datasets/facebook/flores |
| PAWS-ca | Paraphrasing | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/PAWS-ca |
| TE-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/teca |
| VeritasQA_ca | Truthfulness | VeritasQA: A Truthfulness Benchmark Aimed at Multilingual Transferability | TBA |
| WNLI-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/wnli-ca |
| XNLI-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/xnli-ca |
| XQuAD-ca | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/xquad-ca |
### Citation
Paper for CatalanBench coming soon.
<!--```bibtex
@inproceedings{baucells-2024-iberobench,
title = "IberoBench: A Benchmark for LLM Evaluation in Iberian Languages",
author = "Baucells, Irene and
AUTHORS, ADD",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
year = "2024",
publisher = "Association for Computational Linguistics",
}
```
-->
### Groups and Tasks
#### Groups
- `catalan_bench`: All tasks included in CatalanBench.
- `flores_ca`: All FLORES translation tasks from or to Catalan.
#### Tags
- `cabreu`: Three CaBREU tasks for each type of summary (extractive, abstractive and extreme).
- `phrases_va`: Two Phrases_va tasks for language adaptation between Catalan and Valencian.
#### Tasks
The following tasks evaluate models on the CatalanBench dataset using various scoring methods.
- `arc_ca_challenge`
- `arc_ca_easy`
- `belebele_cat_Latn`
- `cabreu`
- `catalanqa`
- `catcola`
- `copa_ca`
- `coqcat`
- `flores_ca`
- `flores_ca-de`
- `flores_ca-en`
- `flores_ca-es`
- `flores_ca-eu`
- `flores_ca-fr`
- `flores_ca-gl`
- `flores_ca-it`
- `flores_ca-pt`
- `flores_de-ca`
- `flores_en-ca`
- `flores_es-ca`
- `flores_eu-ca`
- `flores_fr-ca`
- `flores_gl-ca`
- `flores_it-ca`
- `flores_pt-ca`
- `mgsm_direct_ca`
- `openbookqa_ca`
- `parafraseja`
- `paws_ca`
- `phrases_ca`
- `piqa_ca`
- `siqa_ca`
- `teca`
- `veritasqa_gen_ca`
- `veritasqa_mc1_ca`
- `veritasqa_mc2_ca`
- `wnli_ca`
- `xnli_ca`
- `xquad_ca`
- `xstorycloze_ca`
Some of these tasks are taken from benchmarks already available in LM Evaluation Harness. These are:
- `belebele_cat_Latn`: Belebele Catalan
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation?
* [ ] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
tag: arc_ca
dataset_path: projecte-aina/arc_ca
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
doc_to_text: "Pregunta: {{question}}\nResposta:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Pregunta: {{question}}\nResposta:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
tag: cabreu
dataset_path: projecte-aina/caBreu
dataset_name: null
output_type: generate_until
test_split: test
training_split: train
validation_split: validation
process_docs: !function utils.process_doc_cabreu
metric_list:
- metric: bleu
aggregation: bleu
higher_is_better: true
- metric: !function utils.rouge1
aggregation: !function utils.rouge1_agg
higher_is_better: true
metadata:
version: 1.0
task: arc_ca_challenge
dataset_name: ARC-Challenge
include: _arc_ca_common_yaml
task: arc_ca_easy
dataset_name: ARC-Easy
include: _arc_ca_common_yaml
include: _cabreu_common_yaml
task: cabreu_abstractive
description: "Examina el text següent i genera'n un resum abstractiu, expressant el significat del text original d'una manera més natural i concisa.\n"
doc_to_text: >-
Text: {{content}}
Resum:
doc_to_target: '{{summaries["abstractive"]["a1"]}}'
include: _cabreu_common_yaml
task: cabreu_extractive
description: "Examina el text següent i genera'n un resum extractiu, utilitzant les frases o oracions més rellevants del text original.\n"
doc_to_text: >-
Text: {{content}}
Resum:
doc_to_target: '{{summaries["extractive"]["a1"]}}'
include: _cabreu_common_yaml
task: cabreu_extreme
description: "Examina el text següent i genera'n un resum que sigui el més concís possible i que preservi el significat del text original.\n"
doc_to_text: >-
Text: {{content}}
Resum:
doc_to_target: '{{summaries["extreme"]["a1"]}}'
group: catalan_bench
task:
- belebele_cat_Latn
- xnli_ca
- catcola
- copa_ca
- openbookqa_ca
- parafraseja
- paws_ca
- piqa_ca
- siqa_ca
- teca
- wnli_ca
- arc_ca_easy
- arc_ca_challenge
- xstorycloze_ca
- xquad_ca
- catalanqa
- coqcat
- flores_ca
- cabreu
- mgsm_direct_ca
- phrases_va
metadata:
version: 1.0
task: catalanqa
dataset_path: projecte-aina/catalanqa
dataset_name: null
output_type: generate_until
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Context: {{context}}\n\nPregunta: {{question}}\n\nResposta:"
doc_to_target: '{{answers[0]["text"]}}'
target_delimiter: ' '
process_results: !function utils.process_results_qa
generation_kwargs:
until:
- "\n"
do_sample: false
temperature: 0.0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
- metric: f1
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment