Merge remote-tracking branch 'origin/big-refactor' into calibration

e1ae8a2f · Herbie Bradley · 50e99bd7 · 30936bc7 · e1ae8a2f · e1ae8a2f
Commit e1ae8a2f authored Nov 26, 2023 by Herbie Bradley
20 changed files
--- a/lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/web_of_lies.yaml
+# Few-shot BBH config for the `web_of_lies` subset: the prompt carries three
+# worked question/answer pairs ahead of the templated question.
+dataset_name: web_of_lies
+description: "Evaluate a random boolean function expressed as a word problem.\n\n"
+doc_to_text: "Q: Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?\nA: Yes\n\nQ: Question: Kristian lies. Millie says Kristian lies. Maybelle says Millie tells the truth. Fidel says Maybelle lies. Leda says Fidel lies. Does Leda tell the truth?\nA: Yes\n\nQ: Question: Kristian tells the truth. Michaela says Kristian lies. Raymond says Michaela tells the truth. Osvaldo says Raymond tells the truth. Jamey says Osvaldo tells the truth. Does Jamey tell the truth?\nA: No\n\nQ: {{input}}\nA:"
+# NOTE(review): the referenced template is not shown here; presumably it
+# supplies the settings shared by the few-shot tasks - confirm.
+include: _flan_fewshot_template_yaml
+task: bbh_flan_fewshot_web_of_lies
--- a/lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/word_sorting.yaml
+# Few-shot BBH config for the `word_sorting` subset: the prompt carries three
+# worked sorting examples ahead of the templated question.
+dataset_name: word_sorting
+description: "Sort a list of words.\n\n"
+doc_to_text: "Q: Sort the following words alphabetically: List: oven costume counterpart\nA: costume counterpart oven\n\nQ: Sort the following words alphabetically: List: hypochlorite ponderosa phone credulity\nA: credulity hypochlorite phone ponderosa\n\nQ: Sort the following words alphabetically: List: newt arson parthia seismography mugho aspect census\nA: arson aspect census mugho newt parthia seismography\n\nQ: {{input}}\nA:"
+# NOTE(review): the referenced template is not shown here; presumably it
+# supplies the settings shared by the few-shot tasks - confirm.
+include: _flan_fewshot_template_yaml
+task: bbh_flan_fewshot_word_sorting
--- a/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml
+# Shared settings for the zero-shot FLAN-style BBH tasks; the per-task configs
+# in this directory name this file in their `include` key.
+group: bbh_flan_zeroshot
+# Data source and the split that is evaluated.
+dataset_path: lukaemon/bbh
+test_split: test
+# Output type, and the template that yields the reference answer from the
+# document's `target` field.
+output_type: generate_until
+doc_to_target: "{{target}}"
+# Sampling is switched off, temperature is 0.0, and "</s>" is the stop string.
+generation_kwargs:
+  do_sample: false
+  temperature: 0.0
+  until: ["</s>"]
+# Scored with exact_match, aggregated with the mean; higher is better.
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    # Optional relaxations, currently disabled:
+    # ignore_case: true
+    # ignore_punctuation: true
--- a/lm_eval/tasks/bbh/flan_zeroshot/boolean_expressions.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/boolean_expressions.yaml
+# Zero-shot BBH config for the `boolean_expressions` subset: the prompt is the
+# templated question followed by an answer cue, with no worked examples.
+dataset_name: boolean_expressions
+description: "Evaluate the result of a random Boolean expression.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_boolean_expressions
--- a/lm_eval/tasks/bbh/flan_zeroshot/causal_judgement.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/causal_judgement.yaml
+# Zero-shot BBH config for the `causal_judgement` subset: the prompt is the
+# templated question followed by an answer cue, with no worked examples.
+dataset_name: causal_judgement
+description: "Answer questions about causal attribution.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_causal_judgement
--- a/lm_eval/tasks/bbh/flan_zeroshot/date_understanding.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/date_understanding.yaml
+# Zero-shot BBH config for the `date_understanding` subset: the prompt is the
+# templated question followed by an answer cue, with no worked examples.
+dataset_name: date_understanding
+description: "Infer the date from context.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_date_understanding
--- a/lm_eval/tasks/bbh/flan_zeroshot/disambiguation_qa.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/disambiguation_qa.yaml
+# Zero-shot BBH config for the `disambiguation_qa` subset: the prompt is the
+# templated question followed by an answer cue, with no worked examples.
+dataset_name: disambiguation_qa
+description: "Clarify the meaning of sentences with ambiguous pronouns.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_disambiguation_qa
--- a/lm_eval/tasks/bbh/flan_zeroshot/dyck_languages.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/dyck_languages.yaml
+# Zero-shot BBH config for the `dyck_languages` subset: the prompt is the
+# templated question followed by an answer cue, with no worked examples.
+dataset_name: dyck_languages
+description: "Correctly close a Dyck-n word.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_dyck_languages
--- a/lm_eval/tasks/bbh/flan_zeroshot/formal_fallacies.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/formal_fallacies.yaml
+# Zero-shot BBH config for the `formal_fallacies` subset: the prompt is the
+# templated question followed by an answer cue, with no worked examples.
+dataset_name: formal_fallacies
+description: "Distinguish deductively valid arguments from formal fallacies.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_formal_fallacies
--- a/lm_eval/tasks/bbh/flan_zeroshot/geometric_shapes.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/geometric_shapes.yaml
+# Zero-shot BBH config for the `geometric_shapes` subset: the prompt is the
+# templated question followed by an answer cue, with no worked examples.
+dataset_name: geometric_shapes
+description: "Name geometric shapes from their SVG paths.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_geometric_shapes
--- a/lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/hyperbaton.yaml
+# Zero-shot BBH config for the `hyperbaton` subset: the prompt is the
+# templated question followed by an answer cue, with no worked examples.
+dataset_name: hyperbaton
+description: "Order adjectives correctly in English sentences.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_hyperbaton
--- a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_five_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_five_objects.yaml
+# Zero-shot BBH config for the `logical_deduction_five_objects` subset: the
+# prompt is the templated question followed by an answer cue, with no worked
+# examples.
+dataset_name: logical_deduction_five_objects
+description: "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_logical_deduction_five_objects
--- a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_seven_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_seven_objects.yaml
+# Zero-shot BBH config for the `logical_deduction_seven_objects` subset: the
+# prompt is the templated question followed by an answer cue, with no worked
+# examples.
+dataset_name: logical_deduction_seven_objects
+description: "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_logical_deduction_seven_objects
--- a/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_three_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/logical_deduction_three_objects.yaml
+# Zero-shot BBH config for the `logical_deduction_three_objects` subset: the
+# prompt is the templated question followed by an answer cue, with no worked
+# examples.
+dataset_name: logical_deduction_three_objects
+description: "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_logical_deduction_three_objects
--- a/lm_eval/tasks/bbh/flan_zeroshot/movie_recommendation.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/movie_recommendation.yaml
+# Zero-shot BBH config for the `movie_recommendation` subset: the prompt is
+# the templated question followed by an answer cue, with no worked examples.
+dataset_name: movie_recommendation
+description: "Recommend movies similar to the given list of movies.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_movie_recommendation
--- a/lm_eval/tasks/bbh/flan_zeroshot/multistep_arithmetic_two.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/multistep_arithmetic_two.yaml
+# Zero-shot BBH config for the `multistep_arithmetic_two` subset: the prompt
+# is the templated question followed by an answer cue, with no worked examples.
+dataset_name: multistep_arithmetic_two
+description: "Solve multi-step arithmetic problems.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_multistep_arithmetic_two
--- a/lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/navigate.yaml
+# Zero-shot BBH config for the `navigate` subset: the prompt is the templated
+# question followed by an answer cue, with no worked examples.
+dataset_name: navigate
+description: "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_navigate
--- a/lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/object_counting.yaml
+# Zero-shot BBH config for the `object_counting` subset: the prompt is the
+# templated question followed by an answer cue, with no worked examples.
+dataset_name: object_counting
+description: "Questions that involve enumerating objects and asking the model to count them.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_object_counting
--- a/lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/penguins_in_a_table.yaml
+# Zero-shot BBH config for the `penguins_in_a_table` subset: the prompt is the
+# templated question followed by an answer cue, with no worked examples.
+dataset_name: penguins_in_a_table
+description: "Answer questions about a table of penguins and their attributes.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_penguins_in_a_table
--- a/lm_eval/tasks/bbh/flan_zeroshot/reasoning_about_colored_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/reasoning_about_colored_objects.yaml
+# Zero-shot BBH config for the `reasoning_about_colored_objects` subset: the
+# prompt is the templated question followed by an answer cue, with no worked
+# examples.
+dataset_name: reasoning_about_colored_objects
+description: "Answer extremely simple questions about the colors of objects on a surface.\n\n"
+doc_to_text: "Q: {{input}}\nA:"
+# Names the template file kept in the same directory as this config.
+include: _flan_zeroshot_template_yaml
+task: bbh_flan_zeroshot_reasoning_about_colored_objects