Unverified commit 8ad598df, authored by Jungwhan Kim and committed by GitHub
Browse files

keep new line for task description (#2116)



* add keep trailing newline

* apply ruff-format

* add prompt unit test

* increment the version of tasks that have description with whitespace

* remove white spaces of leaderboard bbh

* update MMLU expected versions in output

* CI run does display the expected version=1 for mmlu subtasks, fix expected test output again

---------
Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>
parent 0571eeb1
dataset_name: temporal_sequences dataset_name: temporal_sequences
description: 'Task description: Answer questions about which times certain events description: 'Task description: Answer questions about which times certain events
could have occurred. could have occurred.'
'
doc_to_choice: ["(A)","(B)","(C)","(D)"] doc_to_choice: ["(A)","(B)","(C)","(D)"]
fewshot_config: fewshot_config:
sampler: first_n sampler: first_n
......
dataset_name: tracking_shuffled_objects_five_objects dataset_name: tracking_shuffled_objects_five_objects
description: 'A task requiring determining the final positions of a set of objects description: 'A task requiring determining the final positions of a set of objects
given their initial positions and a description of a sequence of swaps. given their initial positions and a description of a sequence of swaps.'
'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"] doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"]
fewshot_config: fewshot_config:
sampler: first_n sampler: first_n
......
dataset_name: tracking_shuffled_objects_seven_objects dataset_name: tracking_shuffled_objects_seven_objects
description: 'A task requiring determining the final positions of a set of objects description: 'A task requiring determining the final positions of a set of objects
given their initial positions and a description of a sequence of swaps. given their initial positions and a description of a sequence of swaps.'
'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)"] doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)"]
fewshot_config: fewshot_config:
sampler: first_n sampler: first_n
......
dataset_name: tracking_shuffled_objects_three_objects dataset_name: tracking_shuffled_objects_three_objects
description: 'A task requiring determining the final positions of a set of objects description: 'A task requiring determining the final positions of a set of objects
given their initial positions and a description of a sequence of swaps. given their initial positions and a description of a sequence of swaps.'
'
doc_to_choice: ["(A)","(B)","(C)"] doc_to_choice: ["(A)","(B)","(C)"]
fewshot_config: fewshot_config:
sampler: first_n sampler: first_n
......
dataset_name: web_of_lies dataset_name: web_of_lies
description: 'Evaluate a random boolean function expressed as a word problem. description: 'Evaluate a random boolean function expressed as a word problem.'
'
doc_to_choice: ["Yes","No"] doc_to_choice: ["Yes","No"]
fewshot_config: fewshot_config:
sampler: first_n sampler: first_n
......
...@@ -13,3 +13,5 @@ metric_list: ...@@ -13,3 +13,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
version: 1.0
...@@ -8,6 +8,6 @@ doc_to_text: "Question: {{question.strip()}}\nAnswer:" ...@@ -8,6 +8,6 @@ doc_to_text: "Question: {{question.strip()}}\nAnswer:"
doc_to_choice: "{{choices}}" doc_to_choice: "{{choices}}"
doc_to_target: "{{answer}}" doc_to_target: "{{answer}}"
metadata: metadata:
version: 0.0 version: 1.0
dataset_kwargs: dataset_kwargs:
trust_remote_code: true trust_remote_code: true
...@@ -29,4 +29,4 @@ aggregate_metric_list: ...@@ -29,4 +29,4 @@ aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -12,6 +12,6 @@ metric_list: ...@@ -12,6 +12,6 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.0 version: 1.0
dataset_kwargs: dataset_kwargs:
trust_remote_code: true trust_remote_code: true
...@@ -8,4 +8,4 @@ aggregate_metric_list: ...@@ -8,4 +8,4 @@ aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -6,4 +6,4 @@ aggregate_metric_list: ...@@ -6,4 +6,4 @@ aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -6,4 +6,4 @@ aggregate_metric_list: ...@@ -6,4 +6,4 @@ aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -6,4 +6,4 @@ aggregate_metric_list: ...@@ -6,4 +6,4 @@ aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -6,4 +6,4 @@ aggregate_metric_list: ...@@ -6,4 +6,4 @@ aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -29,4 +29,4 @@ aggregate_metric_list: ...@@ -29,4 +29,4 @@ aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -26,6 +26,6 @@ metric_list: ...@@ -26,6 +26,6 @@ metric_list:
ignore_case: true ignore_case: true
ignore_punctuation: true ignore_punctuation: true
metadata: metadata:
version: 1.0 version: 2.0
dataset_kwargs: dataset_kwargs:
trust_remote_code: true trust_remote_code: true
...@@ -29,4 +29,4 @@ aggregate_metric_list: ...@@ -29,4 +29,4 @@ aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -33,6 +33,6 @@ metric_list: ...@@ -33,6 +33,6 @@ metric_list:
ignore_case: true ignore_case: true
ignore_punctuation: true ignore_punctuation: true
metadata: metadata:
version: 2.0 version: 3.0
dataset_kwargs: dataset_kwargs:
trust_remote_code: true trust_remote_code: true
...@@ -29,4 +29,4 @@ aggregate_metric_list: ...@@ -29,4 +29,4 @@ aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -29,6 +29,6 @@ metric_list: ...@@ -29,6 +29,6 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 2.0 version: 3.0
dataset_kwargs: dataset_kwargs:
trust_remote_code: true trust_remote_code: true
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment