Unverified commit 8ad598df, authored by Jungwhan Kim, committed by GitHub
Browse files

keep new line for task description (#2116)



* add keep trailing newline

* apply ruff-format

* add prompt unit test

* increment the version of tasks that have description with whitespace

* remove trailing whitespace from leaderboard bbh descriptions

* update MMLU expected versions in output

* CI run does display the expected version=1 for mmlu subtasks; fix expected test output again

---------
Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>
parent 0571eeb1
......@@ -13,4 +13,4 @@ num_fewshot: 3
fewshot_config:
sampler: first_n
metadata:
version: 0.0
version: 1.0
dataset_name: boolean_expressions
description: 'Evaluate the result of a random Boolean expression.
'
description: 'Evaluate the result of a random Boolean expression.'
doc_to_choice: ["False", "True"]
fewshot_config:
sampler: first_n
......
dataset_name: causal_judgement
description: 'Answer questions about causal attribution.
'
description: 'Answer questions about causal attribution.'
doc_to_choice: ["Yes", "No"]
fewshot_config:
sampler: first_n
......
dataset_name: date_understanding
description: 'Infer the date from context.
'
description: 'Infer the date from context.'
doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)", "(F)"]
fewshot_config:
sampler: first_n
......
dataset_name: disambiguation_qa
description: 'Clarify the meaning of sentences with ambiguous pronouns.
'
description: 'Clarify the meaning of sentences with ambiguous pronouns.'
doc_to_choice: ["(A)", "(B)", "(C)"]
fewshot_config:
sampler: first_n
......
dataset_name: formal_fallacies
description: 'Distinguish deductively valid arguments from formal fallacies.
'
description: 'Distinguish deductively valid arguments from formal fallacies.'
doc_to_choice: ["valid", "invalid"]
fewshot_config:
sampler: first_n
......
dataset_name: geometric_shapes
description: 'Name geometric shapes from their SVG paths.
'
description: 'Name geometric shapes from their SVG paths.'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)"]
fewshot_config:
sampler: first_n
......
dataset_name: hyperbaton
description: 'Order adjectives correctly in English sentences.
'
description: 'Order adjectives correctly in English sentences.'
doc_to_choice: ["(A)", "(B)"]
fewshot_config:
sampler: first_n
......
dataset_name: logical_deduction_five_objects
description: 'A logical deduction task which requires deducing the order of a sequence
of objects.
'
of objects.'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"]
fewshot_config:
sampler: first_n
......
dataset_name: logical_deduction_seven_objects
description: 'A logical deduction task which requires deducing the order of a sequence
of objects.
'
of objects.'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)"]
fewshot_config:
sampler: first_n
......
dataset_name: logical_deduction_three_objects
description: 'A logical deduction task which requires deducing the order of a sequence
of objects.
'
of objects.'
doc_to_choice: ["(A)","(B)","(C)"]
fewshot_config:
sampler: first_n
......
dataset_name: movie_recommendation
description: 'Recommend movies similar to the given list of movies.
'
description: 'Recommend movies similar to the given list of movies.'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"]
fewshot_config:
sampler: first_n
......
dataset_name: navigate
description: 'Given a series of navigation instructions, determine whether one would
end up back at the starting point.
'
end up back at the starting point.'
doc_to_choice: ["Yes","No"]
fewshot_config:
sampler: first_n
......
dataset_name: object_counting
description: 'Questions that involve enumerating objects and asking the model to count
them.
'
them.'
doc_to_choice: ["0","1","2","3","4","5","6","7","8","9","10", "11", "12", "13", "14", "15", "16", "17", "18"]
fewshot_config:
sampler: first_n
......
dataset_name: penguins_in_a_table
description: 'Answer questions about a table of penguins and their attributes.
'
description: 'Answer questions about a table of penguins and their attributes.'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"]
fewshot_config:
sampler: first_n
......
dataset_name: reasoning_about_colored_objects
description: 'Answer extremely simple questions about the colors of objects on a surface.
'
description: 'Answer extremely simple questions about the colors of objects on a surface.'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)","(L)","(M)","(N)","(O)","(P)","(Q)","(R)"]
fewshot_config:
sampler: first_n
......
dataset_name: ruin_names
description: 'Select the humorous edit that ''ruins'' the input movie or musical artist
name.
'
name.'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"]
fewshot_config:
sampler: first_n
......
dataset_name: salient_translation_error_detection
description: 'Detect the type of error in an English translation of a German source
sentence.
'
sentence.'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"]
fewshot_config:
sampler: first_n
......
......@@ -5,10 +5,7 @@ description: 'Determine which of two sentences is sarcastic.
According to Cambridge University Dictionary, sarcasm is "the use of remarks that
clearly mean the opposite of what they say, made in order to hurt someone''s feelings
or to criticize something in a humorous way." Sarcastic sentences often contain
satirical or ironic utterances, hyperboles, ambivalent or witty remarks.
'
satirical or ironic utterances, hyperboles, ambivalent or witty remarks.'
doc_to_choice: ["(A)","(B)"]
fewshot_config:
sampler: first_n
......
dataset_name: sports_understanding
description: 'Determine whether an artificially constructed sentence relating to sports
is plausible or not.
'
is plausible or not.'
doc_to_choice: ["yes","no"]
fewshot_config:
sampler: first_n
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment