Merge remote-tracking branch 'origin/big-refactor' into calibration

e1ae8a2f · Herbie Bradley · 50e99bd7 · 30936bc7 · e1ae8a2f · e1ae8a2f
Commit e1ae8a2f authored Nov 26, 2023 by Herbie Bradley
20 changed files
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/navigate.yaml
+"dataset_name": "navigate"
+"description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_navigate"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/object_counting.yaml
+"dataset_name": "object_counting"
+"description": "Questions that involve enumerating objects and asking the model to count them.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_object_counting"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/penguins_in_a_table.yaml
+"dataset_name": "penguins_in_a_table"
+"description": "Answer questions about a table of penguins and their attributes.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_penguins_in_a_table"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/reasoning_about_colored_objects.yaml
+"dataset_name": "reasoning_about_colored_objects"
+"description": "Answer extremely simple questions about the colors of objects on a surface.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_reasoning_about_colored_objects"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/ruin_names.yaml
+"dataset_name": "ruin_names"
+"description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_ruin_names"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/salient_translation_error_detection.yaml
+"dataset_name": "salient_translation_error_detection"
+"description": "Detect the type of error in an English translation of a German source sentence.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_salient_translation_error_detection"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/snarks.yaml
+"dataset_name": "snarks"
+"description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_snarks"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/sports_understanding.yaml
+"dataset_name": "sports_understanding"
+"description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_sports_understanding"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/temporal_sequences.yaml
+"dataset_name": "temporal_sequences"
+"description": "Task description: Answer questions about which times certain events could have occurred.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_temporal_sequences"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_five_objects.yaml
+"dataset_name": "tracking_shuffled_objects_five_objects"
+"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_five_objects"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml
+"dataset_name": "tracking_shuffled_objects_seven_objects"
+"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_seven_objects"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/tracking_shuffled_objects_three_objects.yaml
+"dataset_name": "tracking_shuffled_objects_three_objects"
+"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_tracking_shuffled_objects_three_objects"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/web_of_lies.yaml
+"dataset_name": "web_of_lies"
+"description": "Evaluate a random boolean function expressed as a word problem.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_web_of_lies"
--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/word_sorting.yaml
+"dataset_name": "word_sorting"
+"description": "Sort a list of words.\n\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"include": "_flan_cot_zeroshot_template_yaml"
+"task": "bbh_flan_cot_zeroshot_word_sorting"
--- a/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml
+group: bbh_flan_fewshot
+dataset_path: lukaemon/bbh
+output_type: generate_until
+test_split: test
+doc_to_target: "{{target}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    # ignore_case: true
+    # ignore_punctuation: true
+generation_kwargs:
+  until:
+    - "</s>"
+  do_sample: false
+  temperature: 0.0
--- a/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/boolean_expressions.yaml
+"dataset_name": "boolean_expressions"
+"description": "Evaluate the result of a random Boolean expression.\n\n"
+"doc_to_text": "Q: not ( ( not not True ) ) is\nA: False\n\nQ: True and False and not True and True is\nA: False\n\nQ: not not ( not ( False ) ) is\nA: True\n\nQ: {{input}}\nA:"
+"include": "_flan_fewshot_template_yaml"
+"task": "bbh_flan_fewshot_boolean_expressions"
--- a/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/causal_judgement.yaml
+"dataset_name": "causal_judgement"
+"description": "Answer questions about causal attribution.\n\n"
+"doc_to_text": "Q: How would a typical person answer each of the following questions about causation?\nFrank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?\nOptions:\n- Yes\n- No\nA: No\n\nQ: How would a typical person answer each of the following questions about causation?\nSuzy and Billy are working on a project that is very important for our nation's security. The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, there was a motion detector installed in the room where they arrived. The motion detector was set up to be triggered if at least one person appeared in the room at the same time. So the motion detector went off. Did Billy cause the motion detector to go off?\nOptions:\n- Yes\n- No\nA: Yes\n\nQ: How would a typical person answer each of the following questions about causation?\nGeorge and his sister Lena reunite at their parents' house for Thanksgiving. Whereas George just got into medical school, Lena is unhappy in her marriage and recently lost her job. Over the course of the day, George and Lena get into a number of heated arguments. Later in the afternoon they play a game of darts. They split the first two games, and the third game is close until the end. Who will win comes down to George's last shot. If he hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks of the difficult time Lena is having, and he really wants to let her win. He aims the dart at the low point region. He sets up his shot and the dart lands in the low point region. After his shot, Lena wins the game and is very happy. Did George hit the low point region intentionally?\nOptions:\n- Yes\n- No\nA: Yes\n\nQ: {{input}}\nA:"
+"include": "_flan_fewshot_template_yaml"
+"task": "bbh_flan_fewshot_causal_judgement"
--- a/lm_eval/tasks/bbh/flan_fewshot/date_understanding.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/date_understanding.yaml
+"dataset_name": "date_understanding"
+"description": "Infer the date from context.\n\n"
+"doc_to_text": "Q: Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?\nOptions:\n(A) 12/14/2026\n(B) 12/14/1950\n(C) 12/14/2007\n(D) 12/14/1937\n(E) 07/14/1938\n(F) 12/14/1988\nA: (D)\n\nQ: Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?\nOptions:\n(A) 09/04/2018\n(B) 11/11/2018\n(C) 08/25/2018\n(D) 11/02/2018\n(E) 11/04/2018\nA: (B)\n\nQ: Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. What is the date tomorrow in MM/DD/YYYY?\nOptions:\n(A) 01/11/1961\n(B) 01/03/1963\n(C) 01/18/1961\n(D) 10/14/1960\n(E) 01/03/1982\n(F) 12/03/1960\nA: (B)\n\nQ: {{input}}\nA:"
+"include": "_flan_fewshot_template_yaml"
+"task": "bbh_flan_fewshot_date_understanding"
--- a/lm_eval/tasks/bbh/flan_fewshot/disambiguation_qa.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/disambiguation_qa.yaml
+"dataset_name": "disambiguation_qa"
+"description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n"
+"doc_to_text": "Q: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The chief told the counselor that they took the day off.\nOptions:\n(A) The chief took the day off\n(B) The counselor took the day off\n(C) Ambiguous\nA: (A)\n\nQ: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The manager sent a message to the secretary, but he didn't reply yet.\nOptions:\n(A) The secretary didn't reply yet\n(B) The manager didn't reply yet\n(C) Ambiguous\nA: (A)\n\nQ: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: Bailey will plan to meet the director at his office\nOptions:\n(A) It will be Bailey's office\n(B) It will be the director's office\n(C) Ambiguous\nA: (C)\n\nQ: {{input}}\nA:"
+"include": "_flan_fewshot_template_yaml"
+"task": "bbh_flan_fewshot_disambiguation_qa"
--- a/lm_eval/tasks/bbh/flan_fewshot/dyck_languages.yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/dyck_languages.yaml
+"dataset_name": "dyck_languages"
+"description": "Correctly close a Dyck-n word.\n\n"
+"doc_to_text": "Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [\nA: ] } ]\n\nQ: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]\nA: ] ) )\n\nQ: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < > ] > }\nA: } ] >\n\nQ: {{input}}\nA:"
+"include": "_flan_fewshot_template_yaml"
+"task": "bbh_flan_fewshot_dyck_languages"