Merge pull request #1029 from EleutherAI/bbh-fixup

[Refactor] BBH fixup

Merge pull request #1029 from EleutherAI/bbh-fixup
[Refactor] BBH fixup
bf26d979 · Lintang Sutawika · GitHub · e7afee52 · 3b9640b8 · bf26d979
Unverified commit bf26d979, authored Nov 28, 2023 by Lintang Sutawika; committed by GitHub on Nov 28, 2023.
20 changed files
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_seven_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_seven_objects.yaml
 "dataset_name": "logical_deduction_seven_objects"
 "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_logical_deduction_seven_objects"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_logical_deduction_seven_objects"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_three_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/logical_deduction_three_objects.yaml
 "dataset_name": "logical_deduction_three_objects"
 "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_logical_deduction_three_objects"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_logical_deduction_three_objects"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/movie_recommendation.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/movie_recommendation.yaml
 "dataset_name": "movie_recommendation"
 "description": "Recommend movies similar to the given list of movies.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_movie_recommendation"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_movie_recommendation"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/multistep_arithmetic_two.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/multistep_arithmetic_two.yaml
 "dataset_name": "multistep_arithmetic_two"
 "description": "Solve multi-step arithmetic problems.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_multistep_arithmetic_two"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_multistep_arithmetic_two"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml
 "dataset_name": "navigate"
 "description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_navigate"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_navigate"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml
 "dataset_name": "object_counting"
 "description": "Questions that involve enumerating objects and asking the model to count them.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_object_counting"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_object_counting"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml
 "dataset_name": "penguins_in_a_table"
 "description": "Answer questions about a table of penguins and their attributes.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_penguins_in_a_table"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_penguins_in_a_table"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml
 "dataset_name": "reasoning_about_colored_objects"
 "description": "Answer extremely simple questions about the colors of objects on a surface.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_reasoning_about_colored_objects"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_reasoning_about_colored_objects"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml
 "dataset_name": "ruin_names"
 "description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_ruin_names"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_ruin_names"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml
 "dataset_name": "salient_translation_error_detection"
 "description": "Detect the type of error in an English translation of a German source sentence.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_salient_translation_error_detection"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_salient_translation_error_detection"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml
 "dataset_name": "snarks"
 "description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_snarks"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_snarks"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml
 "dataset_name": "sports_understanding"
 "description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_sports_understanding"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_sports_understanding"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml
 "dataset_name": "temporal_sequences"
 "description": "Task description: Answer questions about which times certain events could have occurred.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_temporal_sequences"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_temporal_sequences"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml
 "dataset_name": "tracking_shuffled_objects_five_objects"
 "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_five_objects"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_tracking_shuffled_objects_five_objects"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml
 "dataset_name": "tracking_shuffled_objects_seven_objects"
 "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_seven_objects"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_tracking_shuffled_objects_seven_objects"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml
 "dataset_name": "tracking_shuffled_objects_three_objects"
 "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_three_objects"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_tracking_shuffled_objects_three_objects"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml
 "dataset_name": "web_of_lies"
 "description": "Evaluate a random boolean function expressed as a word problem.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_web_of_lies"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_web_of_lies"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml
 "dataset_name": "word_sorting"
 "description": "Sort a list of words.\n\n"
 "doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
-"include": "_flan_cot_zeroshot_template_yaml"
-"task": "bbh_flan_cot_zeroshot_word_sorting"
+"include": "_cot_zeroshot_template_yaml"
+"task": "bbh_cot_zeroshot_word_sorting"
--- a/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml
-group: bbh_flan_fewshot
+group: bbh_fewshot
 dataset_path: lukaemon/bbh
 output_type: generate_until
 test_split: test
@@ -12,5 +12,7 @@ metric_list:
 generation_kwargs:
  until:
    - "</s>"
+    - "Q"
+    - "\n\n"
  do_sample: false
  temperature: 0.0
--- a/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml
 "dataset_name": "boolean_expressions"
 "description": "Evaluate the result of a random Boolean expression.\n\n"
 "doc_to_text": "Q: not ( ( not not True ) ) is\nA: False\n\nQ: True and False and not True and True is\nA: False\n\nQ: not not ( not ( False ) ) is\nA: True\n\nQ: {{input}}\nA:"
-"include": "_flan_fewshot_template_yaml"
-"task": "bbh_flan_fewshot_boolean_expressions"
+"include": "_fewshot_template_yaml"
+"task": "bbh_fewshot_boolean_expressions"