keep new line for task description (#2116)

* add keep trailing newline
* apply ruff-format
* add prompt unit test
* increment the version of tasks that have description with whitespace
* remove white spaces of leaderboard bbh
* update MMLU expected versions in output
* CI run does display the expected version=1 for mmlu subtasks, fix expected test output again

---------

Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>

keep new line for task description (#2116)
* add keep trailing newline
* apply ruff-format
* add prompt unit test
* increment the version of tasks that have description with whitespace
* remove white spaces of leaderboard bbh
* update MMLU expected versions in output
* CI run does display the expected version=1 for mmlu subtasks, fix expected test output again

---------

Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>
8ad598df · Jungwhan Kim · GitHub · 0571eeb1 · 8ad598df · 8ad598df
Unverified Commit 8ad598df authored Aug 10, 2024 by Jungwhan Kim Committed by GitHub Aug 09, 2024
20 changed files
--- a/lm_eval/tasks/leaderboard/bbh_mc/_fewshot_template_yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/_fewshot_template_yaml
@@ -13,4 +13,4 @@ num_fewshot: 3
 fewshot_config:
  sampler: first_n
 metadata:
-  version: 0.0
+  version: 1.0
--- a/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml
 dataset_name: boolean_expressions
-description: 'Evaluate the result of a random Boolean expression.
-
-        '
+description: 'Evaluate the result of a random Boolean expression.'
 doc_to_choice: ["False", "True"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml
 dataset_name: causal_judgement
-description: 'Answer questions about causal attribution.
-
-        '
+description: 'Answer questions about causal attribution.'
 doc_to_choice: ["Yes", "No"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml
 dataset_name: date_understanding
-description: 'Infer the date from context.
-
-        '
+description: 'Infer the date from context.'
 doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)", "(F)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml
 dataset_name: disambiguation_qa
-description: 'Clarify the meaning of sentences with ambiguous pronouns.
-
-        '
+description: 'Clarify the meaning of sentences with ambiguous pronouns.'
 doc_to_choice: ["(A)", "(B)", "(C)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml
 dataset_name: formal_fallacies
-description: 'Distinguish deductively valid arguments from formal fallacies.
-
-        '
+description: 'Distinguish deductively valid arguments from formal fallacies.'
 doc_to_choice: ["valid", "invalid"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml
 dataset_name: geometric_shapes
-description: 'Name geometric shapes from their SVG paths.
-
-        '
+description: 'Name geometric shapes from their SVG paths.'
 doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml
 dataset_name: hyperbaton
-description: 'Order adjectives correctly in English sentences.
-
-        '
+description: 'Order adjectives correctly in English sentences.'
 doc_to_choice: ["(A)", "(B)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml
 dataset_name: logical_deduction_five_objects
 description: 'A logical deduction task which requires deducing the order of a sequence
-  of objects.
-
-        '
+  of objects.'
 doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml
 dataset_name: logical_deduction_seven_objects
 description: 'A logical deduction task which requires deducing the order of a sequence
-  of objects.
-
-  '
+  of objects.'
 doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml
 dataset_name: logical_deduction_three_objects
 description: 'A logical deduction task which requires deducing the order of a sequence
-  of objects.
-
-        '
+  of objects.'
 doc_to_choice: ["(A)","(B)","(C)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml
 dataset_name: movie_recommendation
-description: 'Recommend movies similar to the given list of movies.
-
-        '
+description: 'Recommend movies similar to the given list of movies.'
 doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml
 dataset_name: navigate
 description: 'Given a series of navigation instructions, determine whether one would
-  end up back at the starting point.
-
-        '
+  end up back at the starting point.'
 doc_to_choice: ["Yes","No"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml
 dataset_name: object_counting
 description: 'Questions that involve enumerating objects and asking the model to count
-  them.
-
-        '
+  them.'
 doc_to_choice: ["0","1","2","3","4","5","6","7","8","9","10", "11", "12", "13", "14", "15", "16", "17", "18"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml
 dataset_name: penguins_in_a_table
-description: 'Answer questions about a table of penguins and their attributes.
-
-        '
+description: 'Answer questions about a table of penguins and their attributes.'
 doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml
 dataset_name: reasoning_about_colored_objects
-description: 'Answer extremely simple questions about the colors of objects on a surface.
-
-        '
+description: 'Answer extremely simple questions about the colors of objects on a surface.'
 doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)","(L)","(M)","(N)","(O)","(P)","(Q)","(R)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml
 dataset_name: ruin_names
 description: 'Select the humorous edit that ''ruins'' the input movie or musical artist
-  name.
-
-        '
+  name.'
 doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml
 dataset_name: salient_translation_error_detection
 description: 'Detect the type of error in an English translation of a German source
-  sentence.
-
-        '
+  sentence.'
 doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml
@@ -5,10 +5,7 @@ description: 'Determine which of two sentences is sarcastic.
  According to Cambridge University Dictionary, sarcasm is "the use of remarks that
  clearly mean the opposite of what they say, made in order to hurt someone''s feelings
  or to criticize something in a humorous way." Sarcastic sentences often contain
-  satirical or ironic utterances, hyperboles, ambivalent or witty remarks.
-
-
-  '
+  satirical or ironic utterances, hyperboles, ambivalent or witty remarks.'
 doc_to_choice: ["(A)","(B)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml
 dataset_name: sports_understanding
 description: 'Determine whether an artificially constructed sentence relating to sports
-  is plausible or not.
-
-        '
+  is plausible or not.'
 doc_to_choice: ["yes","no"]
 fewshot_config:
  sampler: first_n