Merge branch 'main' into group-agg-rework

3d1b8f43 · Lintang Sutawika · GitHub · e200c24e · d855d0ba · 3d1b8f43
Unverified Commit 3d1b8f43 authored Jul 03, 2024 by Lintang Sutawika Committed by GitHub Jul 03, 2024
20 changed files
--- a/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml
+dataset_name: logical_deduction_seven_objects
+description: 'A logical deduction task which requires deducing the order of a sequence
+  of objects.
+
+  '
+doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'The following paragraphs each describe a set of three objects arranged
+      in a fixed order. The statements are logically consistent within each paragraph.
+      In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished
+      above Amy. Eli finished below Amy.
+
+      Options:
+
+      (A) Amy finished last
+
+      (B) Eli finished last
+
+      (C) Eve finished last'
+    target: (B)
+  - input: 'The following paragraphs each describe a set of three objects arranged
+      in a fixed order. The statements are logically consistent within each paragraph.
+      On a shelf, there are three books: a white book, a green book, and an orange
+      book. The green book is to the right of the white book. The orange book is the
+      rightmost.
+
+      Options:
+
+      (A) The white book is the leftmost
+
+      (B) The green book is the leftmost
+
+      (C) The orange book is the leftmost'
+    target: (A)
+  - input: 'The following paragraphs each describe a set of three objects arranged
+      in a fixed order. The statements are logically consistent within each paragraph.
+      On a shelf, there are three books: a red book, a gray book, and a white book.
+      The white book is to the left of the gray book. The red book is the second from
+      the left.
+
+      Options:
+
+      (A) The red book is the leftmost
+
+      (B) The gray book is the leftmost
+
+      (C) The white book is the leftmost'
+    target: (C)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_logical_deduction_seven_objects
--- a/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml
+dataset_name: logical_deduction_three_objects
+description: 'A logical deduction task which requires deducing the order of a sequence
+  of objects.
+
+        '
+doc_to_choice: ["(A)","(B)","(C)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'The following paragraphs each describe a set of three objects arranged
+      in a fixed order. The statements are logically consistent within each paragraph.
+      In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished
+      above Amy. Eli finished below Amy.
+
+      Options:
+
+      (A) Amy finished last
+
+      (B) Eli finished last
+
+      (C) Eve finished last'
+    target: (B)
+  - input: 'The following paragraphs each describe a set of three objects arranged
+      in a fixed order. The statements are logically consistent within each paragraph.
+      On a shelf, there are three books: a white book, a green book, and an orange
+      book. The green book is to the right of the white book. The orange book is the
+      rightmost.
+
+      Options:
+
+      (A) The white book is the leftmost
+
+      (B) The green book is the leftmost
+
+      (C) The orange book is the leftmost'
+    target: (A)
+  - input: 'The following paragraphs each describe a set of three objects arranged
+      in a fixed order. The statements are logically consistent within each paragraph.
+      On a shelf, there are three books: a red book, a gray book, and a white book.
+      The white book is to the left of the gray book. The red book is the second from
+      the left.
+
+      Options:
+
+      (A) The red book is the leftmost
+
+      (B) The gray book is the leftmost
+
+      (C) The white book is the leftmost'
+    target: (C)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_logical_deduction_three_objects
--- a/lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml
+dataset_name: movie_recommendation
+description: 'Recommend movies similar to the given list of movies.
+
+        '
+doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones
+      and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big
+      Lebowski:
+
+      Options:
+
+      (A) Tetsuo
+
+      (B) the Ironman
+
+      (C) The Princess Bride
+
+      (D) The Barkley Marathons The Race That Eats Its Young
+
+      (E) Bug'
+    target: (C)
+  - input: 'Find a movie similar to Twister, The Silence of the Lambs, Independence
+      Day, Braveheart:
+
+      Options:
+
+      (A) They Shoot Horses
+
+      (B) Don''t They
+
+      (C) Forrest Gump
+
+      (D) The Salton Sea
+
+      (E) Extreme Days'
+    target: (C)
+  - input: 'Find a movie similar to Minority Report, Total Recall, Inside Out, Forrest
+      Gump:
+
+      Options:
+
+      (A) Phenomena
+
+      (B) Lilting
+
+      (C) Catwoman
+
+      (D) Edge of Tomorrow'
+    target: (D)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_movie_recommendation
--- a/lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml
+dataset_name: navigate
+description: 'Given a series of navigation instructions, determine whether one would
+  end up back at the starting point.
+
+        '
+doc_to_choice: ["Yes","No"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'If you follow these instructions, do you return to the starting point?
+      Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps.
+      Take 8 steps.
+
+      Options:
+
+      - Yes
+
+      - No'
+    target: 'No'
+  - input: 'If you follow these instructions, do you return to the starting point?
+      Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps.
+      Take 1 step.
+
+      Options:
+
+      - Yes
+
+      - No'
+    target: 'No'
+  - input: 'If you follow these instructions, do you return to the starting point?
+      Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right.
+
+      Options:
+
+      - Yes
+
+      - No'
+    target: 'Yes'
+include: _fewshot_template_yaml
+task: leaderboard_bbh_navigate
--- a/lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml
+dataset_name: object_counting
+description: 'Questions that involve enumerating objects and asking the model to count
+  them.
+
+        '
+doc_to_choice: ["0","1","2","3","4","5","6","7","8","9","10", "11", "12", "13", "14", "15", "16", "17", "18"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana,
+      a flute, an orange, and a violin. How many fruits do I have?
+    target: '6'
+  - input: I have an orange, a raspberry, two peaches, a blackberry, an apple, a grape,
+      a nectarine, and three plums. How many fruits do I have?
+    target: '11'
+  - input: I have a lettuce head, a head of broccoli, an onion, a stalk of celery,
+      two carrots, a garlic, and a yam. How many vegetables do I have?
+    target: '8'
+include: _fewshot_template_yaml
+task: leaderboard_bbh_object_counting
--- a/lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml
+dataset_name: penguins_in_a_table
+description: 'Answer questions about a table of penguins and their attributes.
+
+        '
+doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'Here is a table where the first line is a header and each subsequent line
+      is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard,
+      5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis
+      is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  We now add
+      a penguin to the table:
+
+      James, 12, 90, 12
+
+      How many penguins are less than 8 years old?
+
+      Options:
+
+      (A) 1
+
+      (B) 2
+
+      (C) 3
+
+      (D) 4
+
+      (E) 5'
+    target: (B)
+  - input: 'Here is a table where the first line is a header and each subsequent line
+      is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard,
+      5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis
+      is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  Which is
+      the youngest penguin?
+
+      Options:
+
+      (A) Louis
+
+      (B) Bernard
+
+      (C) Vincent
+
+      (D) Gwen
+
+      (E) James'
+    target: (B)
+  - input: 'Here is a table where the first line is a header and each subsequent line
+      is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard,
+      5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis
+      is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  What is
+      the name of the second penguin sorted by alphabetic order?
+
+      Options:
+
+      (A) Louis
+
+      (B) Bernard
+
+      (C) Vincent
+
+      (D) Gwen
+
+      (E) James'
+    target: (D)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_penguins_in_a_table
--- a/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml
+dataset_name: reasoning_about_colored_objects
+description: 'Answer extremely simple questions about the colors of objects on a surface.
+
+        '
+doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)","(L)","(M)","(N)","(O)","(P)","(Q)","(R)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'On the nightstand, there is a red pencil, a purple mug, a burgundy keychain,
+      a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the
+      stress ball?
+
+      Options:
+
+      (A) red
+
+      (B) orange
+
+      (C) yellow
+
+      (D) green
+
+      (E) blue
+
+      (F) brown
+
+      (G) magenta
+
+      (H) fuchsia
+
+      (I) mauve
+
+      (J) teal
+
+      (K) turquoise
+
+      (L) burgundy
+
+      (M) silver
+
+      (N) gold
+
+      (O) black
+
+      (P) grey
+
+      (Q) purple
+
+      (R) pink'
+    target: (E)
+  - input: 'On the table, you see a bunch of objects arranged in a row: a purple paperclip,
+      a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve
+      fidget spinner, and a burgundy pen. What is the color of the object directly
+      to the right of the stress ball?
+
+      Options:
+
+      (A) red
+
+      (B) orange
+
+      (C) yellow
+
+      (D) green
+
+      (E) blue
+
+      (F) brown
+
+      (G) magenta
+
+      (H) fuchsia
+
+      (I) mauve
+
+      (J) teal
+
+      (K) turquoise
+
+      (L) burgundy
+
+      (M) silver
+
+      (N) gold
+
+      (O) black
+
+      (P) grey
+
+      (Q) purple
+
+      (R) pink'
+    target: (F)
+  - input: 'On the nightstand, you see the following items arranged in a row: a teal
+      plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug,
+      a pink notebook, and a grey cup. How many non-orange items do you see to the
+      left of the teal item?
+
+      Options:
+
+      (A) zero
+
+      (B) one
+
+      (C) two
+
+      (D) three
+
+      (E) four
+
+      (F) five
+
+      (G) six'
+    target: (A)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_reasoning_about_colored_objects
--- a/lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml
+dataset_name: ruin_names
+description: 'Select the humorous edit that ''ruins'' the input movie or musical artist
+  name.
+
+        '
+doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'Which of the following is a humorous edit of this artist or movie name:
+      ''whitesnake''?
+
+      Options:
+
+      (A) whitesnape
+
+      (B) whitesnapke
+
+      (C) whitesnuake
+
+      (D) mwhitesnake'
+    target: (A)
+  - input: 'Which of the following is a humorous edit of this artist or movie name:
+      ''one of our dinosaurs is missing''?
+
+      Options:
+
+      (A) ofne of our dinosaurs is missing
+
+      (B) one af our dinosaurs is missing
+
+      (C) one of our dinosaurs is pissing
+
+      (D) one of our dinosaur is missing'
+    target: (C)
+  - input: 'Which of the following is a humorous edit of this artist or movie name:
+      ''counting crows''?
+
+      Options:
+
+      (A) countingy crows
+
+      (B) counting cows
+
+      (C) courting crows
+
+      (D) coutnting crows'
+    target: (B)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_ruin_names
--- a/lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml
+dataset_name: salient_translation_error_detection
+description: 'Detect the type of error in an English translation of a German source
+  sentence.
+
+        '
+doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: "The following translations from German to English contain a particular
+      error. That error will be one of the following types: Named Entities: An entity
+      (names, places, locations, etc.) is changed to a different entity. Numerical
+      Values: Numerical values (ordinals or cardinals), dates, and/or units are changed.
+      Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are
+      changed. Negation or Antonyms: Introduce or remove a negation or change comparatives
+      to their antonyms. Facts: Trivial factual errors not pertaining to the above
+      classes are introduced in the translations. Dropped Content: A significant clause
+      in the translation is removed. Please identify that error.  Source: In der Liste
+      der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen
+      Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.
+
+      Translation: In the list of architectural monuments in Lenzen all architectural
+      monuments of the Brandenburg city of Lenzen and its districts are listed.
+
+      The translation contains an error pertaining to
+
+      Options:
+
+      (A) Modifiers or Adjectives
+
+      (B) Numerical Values
+
+      (C) Negation or Antonyms
+
+      (D) Named Entities
+
+      (E) Dropped Content
+
+      (F) Facts"
+    target: (D)
+  - input: "The following translations from German to English contain a particular
+      error. That error will be one of the following types: Named Entities: An entity
+      (names, places, locations, etc.) is changed to a different entity. Numerical
+      Values: Numerical values (ordinals or cardinals), dates, and/or units are changed.
+      Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are
+      changed. Negation or Antonyms: Introduce or remove a negation or change comparatives
+      to their antonyms. Facts: Trivial factual errors not pertaining to the above
+      classes are introduced in the translations. Dropped Content: A significant clause
+      in the translation is removed. Please identify that error.  Source: Auf dieser
+      Seite sind die Baudenkm\xE4ler der oberbayerischen Gro\xDFen Kreisstadt Landsberg
+      am Lech zusammengestellt.
+
+      Translation: On this page are compiled the architectural monuments of the town of
+      Landsberg am Lech.
+
+      The translation contains an error pertaining to
+
+      Options:
+
+      (A) Modifiers or Adjectives
+
+      (B) Numerical Values
+
+      (C) Negation or Antonyms
+
+      (D) Named Entities
+
+      (E) Dropped Content
+
+      (F) Facts"
+    target: (E)
+  - input: "The following translations from German to English contain a particular
+      error. That error will be one of the following types: Named Entities: An entity
+      (names, places, locations, etc.) is changed to a different entity. Numerical
+      Values: Numerical values (ordinals or cardinals), dates, and/or units are changed.
+      Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are
+      changed. Negation or Antonyms: Introduce or remove a negation or change comparatives
+      to their antonyms. Facts: Trivial factual errors not pertaining to the above
+      classes are introduced in the translations. Dropped Content: A significant clause
+      in the translation is removed. Please identify that error.  Source: \u0141eba ist
+      eine Kleinstadt und ein Badeort im Powiat L\u0119borski der polnischen Woiwodschaft
+      Pommern.
+
+      Translation: Eba is not a small town and seaside resort in the Powiat L\xE9borski
+      county of the Pomeranian Voivodeship of Poland.\nThe translation contains an error
+      pertaining to
+
+      Options:
+
+      (A) Modifiers or Adjectives
+
+      (B) Numerical Values
+
+      (C) Negation or Antonyms
+
+      (D) Named Entities
+
+      (E) Dropped Content
+
+      (F) Facts"
+    target: (C)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_salient_translation_error_detection
--- a/lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml
+dataset_name: snarks
+description: 'Determine which of two sentences is sarcastic.
+
+
+  According to Cambridge University Dictionary, sarcasm is "the use of remarks that
+  clearly mean the opposite of what they say, made in order to hurt someone''s feelings
+  or to criticize something in a humorous way." Sarcastic sentences often contain
+  satirical or ironic utterances, hyperboles, ambivalent or witty remarks.
+
+
+  '
+doc_to_choice: ["(A)","(B)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'Which statement is sarcastic?
+
+      Options:
+
+      (A) Yes, because having interests and actively researching them is a huge waste
+
+      (B) Yes, because having interests and actively researching them is a huge deal'
+    target: (A)
+  - input: 'Which statement is sarcastic?
+
+      Options:
+
+      (A) No one is going to disagree with you on this. Avoiding ad hominem attacks
+      really help your case
+
+      (B) No one is going to disagree with you on this. Ad hominem attacks really
+      help your case'
+    target: (B)
+  - input: 'Which statement is sarcastic?
+
+      Options:
+
+      (A) Consistency in the league''s punishments? What do you think this is supposed
+      to be, politics?
+
+      (B) Consistency in the league''s punishments? What do you think this is supposed
+      to be, moral?'
+    target: (A)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_snarks
--- a/lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml
+dataset_name: sports_understanding
+description: 'Determine whether an artificially constructed sentence relating to sports
+  is plausible or not.
+
+        '
+doc_to_choice: ["yes","no"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: Is the following sentence plausible? "Bam Adebayo scored a reverse layup
+      in the Western Conference Finals."
+    target: 'yes'
+  - input: Is the following sentence plausible? "Santi Cazorla scored a touchdown."
+    target: 'no'
+  - input: Is the following sentence plausible? "DeMar DeRozan was called for the
+      goal tend."
+    target: 'yes'
+include: _fewshot_template_yaml
+task: leaderboard_bbh_sports_understanding
--- a/lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml
+dataset_name: temporal_sequences
+description: 'Task description: Answer questions about which times certain events
+  could have occurred.
+
+        '
+doc_to_choice: ["(A)","(B)","(C)","(D)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'Today, Emily went to the museum. Between what times could they have gone?
+
+      We know that:
+
+      Emily woke up at 1pm.
+
+      Elizabeth saw Emily reading at the library from 2pm to 4pm.
+
+      Jessica saw Emily watching a movie at the theater from 4pm to 5pm.
+
+      Leslie saw Emily waiting at the airport from 5pm to 6pm.
+
+      William saw Emily buying clothes at the mall from 6pm to 7pm.
+
+      The museum was closed after 7pm.
+
+      Between what times could Emily have gone to the museum?
+
+      Options:
+
+      (A) 1pm to 2pm
+
+      (B) 6pm to 7pm
+
+      (C) 5pm to 6pm
+
+      (D) 2pm to 4pm'
+    target: (A)
+  - input: 'Today, Elizabeth went to the amusement park. Between what times could
+      they have gone?
+
+      We know that:
+
+      Elizabeth woke up at 7am.
+
+      David saw Elizabeth fixing their computer at the electronic store from 1pm to
+      2pm.
+
+      Sarah saw Elizabeth playing tennis at the tennis court from 2pm to 3pm.
+
+      Susan saw Elizabeth walking towards the Statue of Liberty from 3pm to 6pm.
+
+      Andrew saw Elizabeth taking photos near the Eiffel Tower from 6pm to 9pm.
+
+      Emily saw Elizabeth getting a coffee at the cafe from 9pm to 10pm.
+
+      The amusement park was closed after 10pm.
+
+      Between what times could Elizabeth have gone to the amusement park?
+
+      Options:
+
+      (A) 7am to 1pm
+
+      (B) 9pm to 10pm
+
+      (C) 1pm to 2pm
+
+      (D) 3pm to 6pm'
+    target: (A)
+  - input: 'Today, Tiffany went to the beach. Between what times could they have gone?
+
+      We know that:
+
+      Tiffany woke up at 5am.
+
+      Betty saw Tiffany getting a coffee at the cafe from 5am to 6am.
+
+      Jessica saw Tiffany working at the office from 6am to 9am.
+
+      John saw Tiffany stretching at a yoga studio from 9am to 12pm.
+
+      Sean saw Tiffany sitting on a rooftop from 12pm to 2pm.
+
+      Sarah saw Tiffany playing tennis at the tennis court from 2pm to 3pm.
+
+      The beach was closed after 4pm.
+
+      Between what times could Tiffany have gone to the beach?
+
+      Options:
+
+      (A) 9am to 12pm
+
+      (B) 12pm to 2pm
+
+      (C) 5am to 6am
+
+      (D) 3pm to 4pm'
+    target: (D)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_temporal_sequences
--- a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml
+dataset_name: tracking_shuffled_objects_five_objects
+description: 'A task requiring determining the final positions of a set of objects
+  given their initial positions and a description of a sequence of swaps.
+
+        '
+doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they
+      are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire
+      has a pink ball.
+
+      As the game progresses, pairs of players trade balls. First, Claire and Alice
+      swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls.
+      At the end of the game, Bob has the
+
+      Options:
+
+      (A) yellow ball
+
+      (B) blue ball
+
+      (C) pink ball'
+    target: (A)
+  - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they
+      are each holding a ball: Alice has a white ball, Bob has a purple ball, and
+      Claire has a pink ball.
+
+      As the game progresses, pairs of players trade balls. First, Bob and Alice swap
+      balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At
+      the end of the game, Alice has the
+
+      Options:
+
+      (A) white ball
+
+      (B) purple ball
+
+      (C) pink ball'
+    target: (C)
+  - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of
+      a song, they each have a partner: Alice is dancing with Lola, Bob is dancing
+      with Rodrigo, and Claire is dancing with Patrick.
+
+      Throughout the song, the dancers often trade partners. First, Alice and Bob
+      switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice
+      switch partners. At the end of the dance, Alice is dancing with
+
+      Options:
+
+      (A) Lola
+
+      (B) Rodrigo
+
+      (C) Patrick'
+    target: (C)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_tracking_shuffled_objects_five_objects
--- a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml
+dataset_name: tracking_shuffled_objects_seven_objects
+description: 'A task requiring determining the final positions of a set of objects
+  given their initial positions and a description of a sequence of swaps.
+
+        '
+doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they
+      are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire
+      has a pink ball.
+
+      As the game progresses, pairs of players trade balls. First, Claire and Alice
+      swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls.
+      At the end of the game, Bob has the
+
+      Options:
+
+      (A) yellow ball
+
+      (B) blue ball
+
+      (C) pink ball'
+    target: (A)
+  - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they
+      are each holding a ball: Alice has a white ball, Bob has a purple ball, and
+      Claire has a pink ball.
+
+      As the game progresses, pairs of players trade balls. First, Bob and Alice swap
+      balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At
+      the end of the game, Alice has the
+
+      Options:
+
+      (A) white ball
+
+      (B) purple ball
+
+      (C) pink ball'
+    target: (C)
+  - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of
+      a song, they each have a partner: Alice is dancing with Lola, Bob is dancing
+      with Rodrigo, and Claire is dancing with Patrick.
+
+      Throughout the song, the dancers often trade partners. First, Alice and Bob
+      switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice
+      switch partners. At the end of the dance, Alice is dancing with
+
+      Options:
+
+      (A) Lola
+
+      (B) Rodrigo
+
+      (C) Patrick'
+    target: (C)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_tracking_shuffled_objects_seven_objects
--- a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml
+dataset_name: tracking_shuffled_objects_three_objects
+description: 'A task requiring determining the final positions of a set of objects
+  given their initial positions and a description of a sequence of swaps.
+
+        '
+doc_to_choice: ["(A)","(B)","(C)"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they
+      are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire
+      has a pink ball.
+
+      As the game progresses, pairs of players trade balls. First, Claire and Alice
+      swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls.
+      At the end of the game, Bob has the
+
+      Options:
+
+      (A) yellow ball
+
+      (B) blue ball
+
+      (C) pink ball'
+    target: (A)
+  - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they
+      are each holding a ball: Alice has a white ball, Bob has a purple ball, and
+      Claire has a pink ball.
+
+      As the game progresses, pairs of players trade balls. First, Bob and Alice swap
+      balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At
+      the end of the game, Alice has the
+
+      Options:
+
+      (A) white ball
+
+      (B) purple ball
+
+      (C) pink ball'
+    target: (C)
+  - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of
+      a song, they each have a partner: Alice is dancing with Lola, Bob is dancing
+      with Rodrigo, and Claire is dancing with Patrick.
+
+      Throughout the song, the dancers often trade partners. First, Alice and Bob
+      switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice
+      switch partners. At the end of the dance, Alice is dancing with
+
+      Options:
+
+      (A) Lola
+
+      (B) Rodrigo
+
+      (C) Patrick'
+    target: (C)
+include: _fewshot_template_yaml
+task: leaderboard_bbh_tracking_shuffled_objects_three_objects
--- a/lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml
+dataset_name: web_of_lies
+description: 'Evaluate a random boolean function expressed as a word problem.
+
+        '
+doc_to_choice: ["Yes","No"]
+fewshot_config:
+  sampler: first_n
+  samples:
+  - input: 'Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina
+      says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent
+      lies. Does Raymond tell the truth?'
+    target: 'Yes'
+  - input: 'Question: Kristian lies. Millie says Kristian lies. Maybelle says Millie
+      tells the truth. Fidel says Maybelle lies. Leda says Fidel lies. Does Leda tell
+      the truth?'
+    target: 'Yes'
+  - input: 'Question: Kristian tells the truth. Michaela says Kristian lies. Raymond
+      says Michaela tells the truth. Osvaldo says Raymond tells the truth. Jamey says
+      Osvaldo tells the truth. Does Jamey tell the truth?'
+    target: 'No'
+include: _fewshot_template_yaml
+task: leaderboard_bbh_web_of_lies
--- a/lm_eval/tasks/leaderboard/gpqa/_template_yaml
+++ b/lm_eval/tasks/leaderboard/gpqa/_template_yaml
+dataset_path: Idavidrein/gpqa
+group: leaderboard_gpqa
+output_type: multiple_choice
+process_docs: !function utils.process_docs
+training_split: train
+# The Hugging Face dataset only provides a train split, so it is reused as the validation split and no test split is set.
+validation_split: train
+test_split: null
+doc_to_text: "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer: "
+doc_to_target: answer
+doc_to_choice: ["(A)", "(B)", "(C)", "(D)"]
+num_fewshot: 0
+metric_list:
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
+fewshot_config:
+  sampler: first_n
--- a/lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml
+++ b/lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_diamond
+include: _template_yaml
+task: leaderboard_gpqa_diamond
--- a/lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml
+++ b/lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_extended
+include: _template_yaml
+task: leaderboard_gpqa_extended
--- a/lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml
+++ b/lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_main
+include: _template_yaml
+task: leaderboard_gpqa_main