Unverified commit 8ad598df, authored by Jungwhan Kim and committed by GitHub
Browse files

keep new line for task description (#2116)



* add keep trailing newline

* apply ruff-format

* add prompt unit test

* increment the version of tasks that have description with whitespace

* remove white spaces of leaderboard bbh

* update MMLU expected versions in output

* CI run does display the expected version=1 for mmlu subtasks, fix expected test output again

---------
Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>
parent 0571eeb1
...@@ -23,4 +23,4 @@ aggregate_metric_list: ...@@ -23,4 +23,4 @@ aggregate_metric_list:
aggregation: mean aggregation: mean
weight_by_size: true weight_by_size: true
metadata: metadata:
version: 0.0 version: 1.0
...@@ -15,4 +15,4 @@ metric_list: ...@@ -15,4 +15,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.0 version: 1.0
...@@ -13,4 +13,4 @@ aggregate_metric_list: ...@@ -13,4 +13,4 @@ aggregate_metric_list:
aggregation: mean aggregation: mean
weight_by_size: true weight_by_size: true
metadata: metadata:
version: 0.0 version: 1.0
...@@ -15,4 +15,4 @@ metric_list: ...@@ -15,4 +15,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.0 version: 1.0
...@@ -33,4 +33,4 @@ aggregate_metric_list: ...@@ -33,4 +33,4 @@ aggregate_metric_list:
weight_by_size: true weight_by_size: true
filter_list: get-answer filter_list: get-answer
metadata: metadata:
version: 2.0 version: 3.0
...@@ -33,4 +33,4 @@ aggregate_metric_list: ...@@ -33,4 +33,4 @@ aggregate_metric_list:
weight_by_size: true weight_by_size: true
filter_list: get-answer filter_list: get-answer
metadata: metadata:
version: 2.0 version: 3.0
...@@ -24,4 +24,4 @@ filter_list: ...@@ -24,4 +24,4 @@ filter_list:
- function: "take_first" - function: "take_first"
num_fewshot: 3 num_fewshot: 3
metadata: metadata:
version: 2.0 version: 3.0
...@@ -33,4 +33,4 @@ aggregate_metric_list: ...@@ -33,4 +33,4 @@ aggregate_metric_list:
weight_by_size: true weight_by_size: true
filter_list: flexible-extract filter_list: flexible-extract
metadata: metadata:
version: 2.0 version: 3.0
...@@ -23,4 +23,4 @@ generation_kwargs: ...@@ -23,4 +23,4 @@ generation_kwargs:
temperature: 0.0 temperature: 0.0
num_fewshot: 0 num_fewshot: 0
metadata: metadata:
version: 2.0 version: 3.0
...@@ -32,4 +32,4 @@ aggregate_metric_list: ...@@ -32,4 +32,4 @@ aggregate_metric_list:
aggregation: mean aggregation: mean
weight_by_size: true weight_by_size: true
metadata: metadata:
version: 2.0 version: 3.0
...@@ -17,4 +17,4 @@ generation_kwargs: ...@@ -17,4 +17,4 @@ generation_kwargs:
temperature: 0.0 temperature: 0.0
num_fewshot: 3 num_fewshot: 3
metadata: metadata:
version: 1.0 version: 2.0
...@@ -33,4 +33,4 @@ aggregate_metric_list: ...@@ -33,4 +33,4 @@ aggregate_metric_list:
weight_by_size: true weight_by_size: true
filter_list: flexible-extract filter_list: flexible-extract
metadata: metadata:
version: 2.0 version: 3.0
...@@ -23,4 +23,4 @@ generation_kwargs: ...@@ -23,4 +23,4 @@ generation_kwargs:
temperature: 0.0 temperature: 0.0
num_fewshot: 0 num_fewshot: 0
metadata: metadata:
version: 2.0 version: 3.0
...@@ -7,7 +7,7 @@ aggregate_metric_list: ...@@ -7,7 +7,7 @@ aggregate_metric_list:
weight_by_size: true weight_by_size: true
group: ceval-valid group: ceval-valid
metadata: metadata:
version: 1.0 version: 2.0
task: task:
- ceval-valid_computer_network - ceval-valid_computer_network
- ceval-valid_operating_system - ceval-valid_operating_system
......
...@@ -15,4 +15,4 @@ metric_list: ...@@ -15,4 +15,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 2.0
...@@ -75,4 +75,4 @@ aggregate_metric_list: ...@@ -75,4 +75,4 @@ aggregate_metric_list:
metric: acc_norm metric: acc_norm
weight_by_size: true weight_by_size: true
metadata: metadata:
version: 0.0 version: 1.0
...@@ -15,4 +15,4 @@ metric_list: ...@@ -15,4 +15,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.0 version: 1.0
...@@ -35,4 +35,4 @@ metric_list: ...@@ -35,4 +35,4 @@ metric_list:
ignore_case: true ignore_case: true
ignore_punctuation: true ignore_punctuation: true
metadata: metadata:
version: 1.0 version: 2.0
...@@ -36,4 +36,4 @@ metric_list: ...@@ -36,4 +36,4 @@ metric_list:
ignore_case: true ignore_case: true
ignore_punctuation: true ignore_punctuation: true
metadata: metadata:
version: 1.0 version: 2.0
...@@ -18,4 +18,4 @@ metric_list: ...@@ -18,4 +18,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 2.0
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment