add more explicit aggregation groups

46e8c8e6 · haileyschoelkopf · a382359c · 46e8c8e6 · 46e8c8e6 · 46e8c8e6
Commit 46e8c8e6 authored Jun 21, 2024 by haileyschoelkopf
20 changed files
--- a/lm_eval/tasks/aclue/README.md
+++ b/lm_eval/tasks/aclue/README.md
@@ -26,7 +26,7 @@ Homepage: https://github.com/isen-zhang/ACLUE
 }
 ```
-### Groups and Tasks
+### Groups, Tags, and Tasks
 #### Groups

--- a/lm_eval/tasks/aexams/README.md
+++ b/lm_eval/tasks/aexams/README.md
@@ -24,11 +24,11 @@ Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomInt
 ### Citation
-### Groups and Tasks
+### Groups, Tags, and Tasks
 #### Groups
- `EXAMS Arabic`: include IslamicStudies, Biology, Science, Physics, Social.
+- `aexams`: Arabic EXAMS dataset, including the IslamicStudies, Biology, Science, Physics, and Social subjects.
 #### Tasks

--- a/lm_eval/tasks/agieval/README.md
+++ b/lm_eval/tasks/agieval/README.md
@@ -75,7 +75,7 @@ Please make sure to cite all the individual datasets in your paper when you use
 }
 ```
-### Groups and Tasks
+### Groups, Tags, and Tasks
 #### Groups
@@ -89,6 +89,10 @@ Please make sure to cite all the individual datasets in your paper when you use
 - `agieval_nous`: Evaluates a specific subset of AGIEval tasks (multiple-choice and english-only), namely those in https://github.com/teknium1/LLM-Benchmark-Logs/blob/main/benchmark-logs/Mistral-7B-Base.md
+#### Tags
+None.
 #### Tasks
 - `agieval_aqua_rat`

--- a/lm_eval/tasks/anli/anli_r1.yaml
+++ b/lm_eval/tasks/anli/anli_r1.yaml
-group:
+tag:
  - anli
 task: anli_r1
 dataset_path: anli

--- a/lm_eval/tasks/arc/README.md
+++ b/lm_eval/tasks/arc/README.md
@@ -29,10 +29,14 @@ Homepage: https://allenai.org/data/arc
 }
 ```
-### Groups and Tasks
+### Groups, Tags, and Tasks
 #### Groups
+None.
+#### Tags
 * `ai2_arc`: Evaluates `arc_easy` and `arc_challenge`
 #### Tasks

--- a/lm_eval/tasks/arc/arc_easy.yaml
+++ b/lm_eval/tasks/arc/arc_easy.yaml
-group:
+tag:
  - ai2_arc
 task: arc_easy
 dataset_path: allenai/ai2_arc

--- a/lm_eval/tasks/arithmetic/README.md
+++ b/lm_eval/tasks/arithmetic/README.md
@@ -27,9 +27,9 @@ Homepage: https://github.com/openai/gpt-3/tree/master/data
 }
 ```
-### Groups and Tasks
+### Groups, Tags, and Tasks
-#### Groups
+#### Tags
 * `arithmetic`: Evaluates `1dc` to `5ds`

--- a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
-group:
+tag:
  - arithmetic
 task: arithmetic_1dc
 dataset_path: EleutherAI/arithmetic

--- a/lm_eval/tasks/asdiv/README.md
+++ b/lm_eval/tasks/asdiv/README.md
@@ -32,7 +32,7 @@ Homepage: https://github.com/chaochun/nlu-asdiv-dataset
 }
 ```
-### Groups and Tasks
+### Groups, Tags, and Tasks
 #### Groups

--- a/lm_eval/tasks/babi/README.md
+++ b/lm_eval/tasks/babi/README.md
@@ -21,12 +21,16 @@ Homepage: https://github.com/facebookarchive/bAbI-tasks
 }
 ```
-### Groups and Tasks
+### Groups, Tags, and Tasks
 #### Groups
 * Not part of a group yet
+#### Tags
+* No tags applied.
 #### Tasks
 * `babi`

--- a/lm_eval/tasks/basqueglue/README.md
+++ b/lm_eval/tasks/basqueglue/README.md
@@ -43,11 +43,15 @@ Homepage: `https://github.com/hitz-zentroa/latxa`
 }
 ```
-### Groups and Tasks
+### Groups, Tags, and Tasks
 #### Groups
-* `basque-glue`: First version of the implementation
+None.
+#### Tags
+* `basque-glue`: First version of the implementation. Calls all subtasks, but does not average their scores.
 #### Tasks

--- a/lm_eval/tasks/bbh/README.md
+++ b/lm_eval/tasks/bbh/README.md
@@ -21,15 +21,19 @@ Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard
 }
 ```
-### Groups and Tasks
+### Groups, Tags, and Tasks
 #### Groups
+- `bbh`: Same task list and aggregation as `bbh_cot_fewshot`.
 - `bbh_zeroshot`
 - `bbh_fewshot`
 - `bbh_cot_fewshot`
 - `bbh_cot_zeroshot`
+#### Tags
+None.
 #### Tasks

--- a/lm_eval/tasks/bbh/cot_fewshot/_bbh.yaml
+++ b/lm_eval/tasks/bbh/cot_fewshot/_bbh.yaml
+group: bbh
+task:
+  - bbh_cot_fewshot_boolean_expressions
+  - bbh_cot_fewshot_causal_judgement
+  - bbh_cot_fewshot_date_understanding
+  - bbh_cot_fewshot_disambiguation_qa
+  - bbh_cot_fewshot_dyck_languages
+  - bbh_cot_fewshot_formal_languages
+  - bbh_cot_fewshot_geometric_shapes
+  - bbh_cot_fewshot_hyperbaton
+  - bbh_cot_fewshot_logical_deduction_five_objects
+  - bbh_cot_fewshot_logical_deduction_seven_objects
+  - bbh_cot_fewshot_logical_deduction_three_objects
+  - bbh_cot_fewshot_movie_recommendation
+  - bbh_cot_fewshot_multistep_arithmetic_two
+  - bbh_cot_fewshot_navigate
+  - bbh_cot_fewshot_object_counting
+  - bbh_cot_fewshot_penguins_in_a_table
+  - bbh_cot_fewshot_reasoning_about_colored_objects
+  - bbh_cot_fewshot_ruin_names
+  - bbh_cot_fewshot_salient_translation_error_detection
+  - bbh_cot_fewshot_snarks
+  - bbh_cot_fewshot_sports_understanding
+  - bbh_cot_fewshot_temporal_sequences
+  - bbh_cot_fewshot_tracking_shuffled_objects_five_objects
+  - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects
+  - bbh_cot_fewshot_tracking_shuffled_objects_three_objects
+  - bbh_cot_fewshot_web_of_lies
+  - bbh_cot_fewshot_word_sorting
+aggregate_metric:
+  - metric: exact_match
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 2.0
--- a/lm_eval/tasks/bbh/cot_fewshot/_bbh_cot_fewshot.yaml
+++ b/lm_eval/tasks/bbh/cot_fewshot/_bbh_cot_fewshot.yaml
+group: bbh_cot_fewshot
+task:
+  - bbh_cot_fewshot_boolean_expressions
+  - bbh_cot_fewshot_causal_judgement
+  - bbh_cot_fewshot_date_understanding
+  - bbh_cot_fewshot_disambiguation_qa
+  - bbh_cot_fewshot_dyck_languages
+  - bbh_cot_fewshot_formal_languages
+  - bbh_cot_fewshot_geometric_shapes
+  - bbh_cot_fewshot_hyperbaton
+  - bbh_cot_fewshot_logical_deduction_five_objects
+  - bbh_cot_fewshot_logical_deduction_seven_objects
+  - bbh_cot_fewshot_logical_deduction_three_objects
+  - bbh_cot_fewshot_movie_recommendation
+  - bbh_cot_fewshot_multistep_arithmetic_two
+  - bbh_cot_fewshot_navigate
+  - bbh_cot_fewshot_object_counting
+  - bbh_cot_fewshot_penguins_in_a_table
+  - bbh_cot_fewshot_reasoning_about_colored_objects
+  - bbh_cot_fewshot_ruin_names
+  - bbh_cot_fewshot_salient_translation_error_detection
+  - bbh_cot_fewshot_snarks
+  - bbh_cot_fewshot_sports_understanding
+  - bbh_cot_fewshot_temporal_sequences
+  - bbh_cot_fewshot_tracking_shuffled_objects_five_objects
+  - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects
+  - bbh_cot_fewshot_tracking_shuffled_objects_three_objects
+  - bbh_cot_fewshot_web_of_lies
+  - bbh_cot_fewshot_word_sorting
+aggregate_metric:
+  - metric: exact_match
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 2.0
--- a/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
-group:
- bbh
- bbh_cot_fewshot
 dataset_path: lukaemon/bbh
 output_type: generate_until
 test_split: test

--- a/lm_eval/tasks/bbh/cot_zeroshot/_bbh_cot_zeroshot.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/_bbh_cot_zeroshot.yaml
+group: bbh_cot_zeroshot
+task:
+  - bbh_cot_zeroshot_boolean_expressions
+  - bbh_cot_zeroshot_causal_judgement
+  - bbh_cot_zeroshot_date_understanding
+  - bbh_cot_zeroshot_disambiguation_qa
+  - bbh_cot_zeroshot_dyck_languages
+  - bbh_cot_zeroshot_formal_languages
+  - bbh_cot_zeroshot_geometric_shapes
+  - bbh_cot_zeroshot_hyperbaton
+  - bbh_cot_zeroshot_logical_deduction_five_objects
+  - bbh_cot_zeroshot_logical_deduction_seven_objects
+  - bbh_cot_zeroshot_logical_deduction_three_objects
+  - bbh_cot_zeroshot_movie_recommendation
+  - bbh_cot_zeroshot_multistep_arithmetic_two
+  - bbh_cot_zeroshot_navigate
+  - bbh_cot_zeroshot_object_counting
+  - bbh_cot_zeroshot_penguins_in_a_table
+  - bbh_cot_zeroshot_reasoning_about_colored_objects
+  - bbh_cot_zeroshot_ruin_names
+  - bbh_cot_zeroshot_salient_translation_error_detection
+  - bbh_cot_zeroshot_snarks
+  - bbh_cot_zeroshot_sports_understanding
+  - bbh_cot_zeroshot_temporal_sequences
+  - bbh_cot_zeroshot_tracking_shuffled_objects_five_objects
+  - bbh_cot_zeroshot_tracking_shuffled_objects_seven_objects
+  - bbh_cot_zeroshot_tracking_shuffled_objects_three_objects
+  - bbh_cot_zeroshot_web_of_lies
+  - bbh_cot_zeroshot_word_sorting
+aggregate_metric:
+  - metric: exact_match
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 2.0
--- a/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
-group: bbh_cot_zeroshot
 dataset_path: lukaemon/bbh
 output_type: generate_until
 test_split: test

--- a/lm_eval/tasks/bbh/fewshot/_bbh_fewshot.yaml
+++ b/lm_eval/tasks/bbh/fewshot/_bbh_fewshot.yaml
+group: bbh_fewshot
+task:
+  - bbh_fewshot_boolean_expressions
+  - bbh_fewshot_causal_judgement
+  - bbh_fewshot_date_understanding
+  - bbh_fewshot_disambiguation_qa
+  - bbh_fewshot_dyck_languages
+  - bbh_fewshot_formal_languages
+  - bbh_fewshot_geometric_shapes
+  - bbh_fewshot_hyperbaton
+  - bbh_fewshot_logical_deduction_five_objects
+  - bbh_fewshot_logical_deduction_seven_objects
+  - bbh_fewshot_logical_deduction_three_objects
+  - bbh_fewshot_movie_recommendation
+  - bbh_fewshot_multistep_arithmetic_two
+  - bbh_fewshot_navigate
+  - bbh_fewshot_object_counting
+  - bbh_fewshot_penguins_in_a_table
+  - bbh_fewshot_reasoning_about_colored_objects
+  - bbh_fewshot_ruin_names
+  - bbh_fewshot_salient_translation_error_detection
+  - bbh_fewshot_snarks
+  - bbh_fewshot_sports_understanding
+  - bbh_fewshot_temporal_sequences
+  - bbh_fewshot_tracking_shuffled_objects_five_objects
+  - bbh_fewshot_tracking_shuffled_objects_seven_objects
+  - bbh_fewshot_tracking_shuffled_objects_three_objects
+  - bbh_fewshot_web_of_lies
+  - bbh_fewshot_word_sorting
+aggregate_metric:
+  - metric: exact_match
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 2.0
--- a/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
-group: bbh_fewshot
 dataset_path: lukaemon/bbh
 output_type: generate_until
 test_split: test

--- a/lm_eval/tasks/bbh/zeroshot/_bbh_zeroshot.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/_bbh_zeroshot.yaml
+group: bbh_zeroshot
+task:
+  - bbh_zeroshot_boolean_expressions
+  - bbh_zeroshot_causal_judgement
+  - bbh_zeroshot_date_understanding
+  - bbh_zeroshot_disambiguation_qa
+  - bbh_zeroshot_dyck_languages
+  - bbh_zeroshot_formal_languages
+  - bbh_zeroshot_geometric_shapes
+  - bbh_zeroshot_hyperbaton
+  - bbh_zeroshot_logical_deduction_five_objects
+  - bbh_zeroshot_logical_deduction_seven_objects
+  - bbh_zeroshot_logical_deduction_three_objects
+  - bbh_zeroshot_movie_recommendation
+  - bbh_zeroshot_multistep_arithmetic_two
+  - bbh_zeroshot_navigate
+  - bbh_zeroshot_object_counting
+  - bbh_zeroshot_penguins_in_a_table
+  - bbh_zeroshot_reasoning_about_colored_objects
+  - bbh_zeroshot_ruin_names
+  - bbh_zeroshot_salient_translation_error_detection
+  - bbh_zeroshot_snarks
+  - bbh_zeroshot_sports_understanding
+  - bbh_zeroshot_temporal_sequences
+  - bbh_zeroshot_tracking_shuffled_objects_five_objects
+  - bbh_zeroshot_tracking_shuffled_objects_seven_objects
+  - bbh_zeroshot_tracking_shuffled_objects_three_objects
+  - bbh_zeroshot_web_of_lies
+  - bbh_zeroshot_word_sorting
+aggregate_metric:
+  - metric: exact_match
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 2.0