Unverified commit 8ad598df, authored by Jungwhan Kim, committed by GitHub
Browse files

keep new line for task description (#2116)



* add keep trailing newline

* apply ruff-format

* add prompt unit test

* increment the version of tasks that have description with whitespace

* remove white spaces of leaderboard bbh

* update MMLU expected versions in output

* CI run does display the expected version=1 for mmlu subtasks, fix expected test output again

---------
Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>
parent 0571eeb1
......@@ -23,4 +23,4 @@ aggregate_metric_list:
aggregation: mean
weight_by_size: true
metadata:
-version: 0.0
+version: 1.0
......@@ -15,4 +15,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
-version: 0.0
+version: 1.0
......@@ -13,4 +13,4 @@ aggregate_metric_list:
aggregation: mean
weight_by_size: true
metadata:
-version: 0.0
+version: 1.0
......@@ -15,4 +15,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
-version: 0.0
+version: 1.0
......@@ -33,4 +33,4 @@ aggregate_metric_list:
weight_by_size: true
filter_list: get-answer
metadata:
-version: 2.0
+version: 3.0
......@@ -33,4 +33,4 @@ aggregate_metric_list:
weight_by_size: true
filter_list: get-answer
metadata:
-version: 2.0
+version: 3.0
......@@ -24,4 +24,4 @@ filter_list:
- function: "take_first"
num_fewshot: 3
metadata:
-version: 2.0
+version: 3.0
......@@ -33,4 +33,4 @@ aggregate_metric_list:
weight_by_size: true
filter_list: flexible-extract
metadata:
-version: 2.0
+version: 3.0
......@@ -23,4 +23,4 @@ generation_kwargs:
temperature: 0.0
num_fewshot: 0
metadata:
-version: 2.0
+version: 3.0
......@@ -32,4 +32,4 @@ aggregate_metric_list:
aggregation: mean
weight_by_size: true
metadata:
-version: 2.0
+version: 3.0
......@@ -17,4 +17,4 @@ generation_kwargs:
temperature: 0.0
num_fewshot: 3
metadata:
-version: 1.0
+version: 2.0
......@@ -33,4 +33,4 @@ aggregate_metric_list:
weight_by_size: true
filter_list: flexible-extract
metadata:
-version: 2.0
+version: 3.0
......@@ -23,4 +23,4 @@ generation_kwargs:
temperature: 0.0
num_fewshot: 0
metadata:
-version: 2.0
+version: 3.0
......@@ -7,7 +7,7 @@ aggregate_metric_list:
weight_by_size: true
group: ceval-valid
metadata:
-version: 1.0
+version: 2.0
task:
- ceval-valid_computer_network
- ceval-valid_operating_system
......
......@@ -15,4 +15,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
-version: 1.0
+version: 2.0
......@@ -75,4 +75,4 @@ aggregate_metric_list:
metric: acc_norm
weight_by_size: true
metadata:
-version: 0.0
+version: 1.0
......@@ -15,4 +15,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
-version: 0.0
+version: 1.0
......@@ -35,4 +35,4 @@ metric_list:
ignore_case: true
ignore_punctuation: true
metadata:
-version: 1.0
+version: 2.0
......@@ -36,4 +36,4 @@ metric_list:
ignore_case: true
ignore_punctuation: true
metadata:
-version: 1.0
+version: 2.0
......@@ -18,4 +18,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
-version: 1.0
+version: 2.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment