keep new line for task description (#2116)

* add keep trailing newline
* apply ruff-format
* add prompt unit test
* increment the version of tasks that have description with whitespace
* remove white spaces of leaderboard bbh
* update MMLU expected versions in output
* CI run does display the expected version=1 for mmlu subtasks, fix expected test output again

---------

Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>

keep new line for task description (#2116)
* add keep trailing newline
* apply ruff-format
* add prompt unit test
* increment the version of tasks that have description with whitespace
* remove white spaces of leaderboard bbh
* update MMLU expected versions in output
* CI run does display the expected version=1 for mmlu subtasks, fix expected test output again

---------

Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>
8ad598df · Jungwhan Kim · GitHub · 0571eeb1 · 8ad598df · 8ad598df
Unverified Commit 8ad598df authored Aug 10, 2024 by Jungwhan Kim Committed by GitHub Aug 09, 2024
20 changed files
--- a/lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml
 dataset_name: temporal_sequences
 description: 'Task description: Answer questions about which times certain events
-  could have occurred.
+  could have occurred.'
-        '
 doc_to_choice: ["(A)","(B)","(C)","(D)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml
 dataset_name: tracking_shuffled_objects_five_objects
 description: 'A task requiring determining the final positions of a set of objects
-  given their initial positions and a description of a sequence of swaps.
+  given their initial positions and a description of a sequence of swaps.'
-        '
 doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml
 dataset_name: tracking_shuffled_objects_seven_objects
 description: 'A task requiring determining the final positions of a set of objects
-  given their initial positions and a description of a sequence of swaps.
+  given their initial positions and a description of a sequence of swaps.'
-        '
 doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml
 dataset_name: tracking_shuffled_objects_three_objects
 description: 'A task requiring determining the final positions of a set of objects
-  given their initial positions and a description of a sequence of swaps.
+  given their initial positions and a description of a sequence of swaps.'
-        '
 doc_to_choice: ["(A)","(B)","(C)"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml
 dataset_name: web_of_lies
-description: 'Evaluate a random boolean function expressed as a word problem.
+description: 'Evaluate a random boolean function expressed as a word problem.'
-        '
 doc_to_choice: ["Yes","No"]
 fewshot_config:
  sampler: first_n

--- a/lm_eval/tasks/med_concepts_qa/_default_template_yaml
+++ b/lm_eval/tasks/med_concepts_qa/_default_template_yaml
@@ -13,3 +13,5 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml
+++ b/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml
@@ -8,6 +8,6 @@ doc_to_text: "Question: {{question.strip()}}\nAnswer:"
 doc_to_choice: "{{choices}}"
 doc_to_target: "{{answer}}"
 metadata:
-  version: 0.0
+  version: 1.0
 dataset_kwargs:
  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/continuation/_mmlu.yaml
+++ b/lm_eval/tasks/mmlu/continuation/_mmlu.yaml
@@ -29,4 +29,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 1
+  version: 2
--- a/lm_eval/tasks/mmlu/default/_default_template_yaml
+++ b/lm_eval/tasks/mmlu/default/_default_template_yaml
@@ -12,6 +12,6 @@ metric_list:
    aggregation: mean
    higher_is_better: true
 metadata:
-  version: 0.0
+  version: 1.0
 dataset_kwargs:
  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/default/_mmlu.yaml
+++ b/lm_eval/tasks/mmlu/default/_mmlu.yaml
@@ -8,4 +8,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 1
+  version: 2
--- a/lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml
+++ b/lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml
@@ -6,4 +6,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 1
+  version: 2
--- a/lm_eval/tasks/mmlu/default/_mmlu_other.yaml
+++ b/lm_eval/tasks/mmlu/default/_mmlu_other.yaml
@@ -6,4 +6,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 1
+  version: 2
--- a/lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml
+++ b/lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml
@@ -6,4 +6,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 1
+  version: 2
--- a/lm_eval/tasks/mmlu/default/_mmlu_stem.yaml
+++ b/lm_eval/tasks/mmlu/default/_mmlu_stem.yaml
@@ -6,4 +6,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 1
+  version: 2
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
@@ -29,4 +29,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 1
+  version: 2
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
@@ -26,6 +26,6 @@ metric_list:
    ignore_case: true
    ignore_punctuation: true
 metadata:
-  version: 1.0
+  version: 2.0
 dataset_kwargs:
  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml
@@ -29,4 +29,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 1
+  version: 2
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
@@ -33,6 +33,6 @@ metric_list:
    ignore_case: true
    ignore_punctuation: true
 metadata:
-  version: 2.0
+  version: 3.0
 dataset_kwargs:
  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml
@@ -29,4 +29,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 1
+  version: 2
--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
@@ -29,6 +29,6 @@ metric_list:
    aggregation: mean
    higher_is_better: true
 metadata:
-  version: 2.0
+  version: 3.0
 dataset_kwargs:
  trust_remote_code: true