Merge pull request #1031 from EleutherAI/versioning

[Refactor] Versioning

Merge pull request #1031 from EleutherAI/versioning
[Refactor] Versioning
dc5b3d5d · Stella Biderman · GitHub · 39c2bb4e · 52f75f0e · dc5b3d5d
Unverified Commit dc5b3d5d authored Nov 28, 2023 by Stella Biderman Committed by GitHub Nov 28, 2023
20 changed files
--- a/docs/task_guide.md
+++ b/docs/task_guide.md
@@ -50,7 +50,7 @@ Scoring details:
 - **doc_to_decontamination_query** (`str`, *optional*) —
 Other:
- **metadata** (`str`, *optional*) — An optional field where arbitrary metadata can be passed.
+- **metadata** (`Union[str, list]`, *optional*) — An optional field where arbitrary metadata can be passed. For example, a `version` entry can be used to denote the version of the YAML config.
 ## Filters

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -91,7 +91,9 @@ class TaskConfig(dict):
    should_decontaminate: bool = False
    doc_to_decontamination_query: str = None
-    metadata: str = None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
+    metadata: Union[
+        str, list
+    ] = None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
    def __post_init__(self) -> None:
        if self.dataset_path and ("." in self.dataset_path):

--- a/lm_eval/tasks/anli/anli_r1.yaml
+++ b/lm_eval/tasks/anli/anli_r1.yaml
@@ -22,3 +22,5 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 1.0
--- a/lm_eval/tasks/arc/arc_easy.yaml
+++ b/lm_eval/tasks/arc/arc_easy.yaml
@@ -19,3 +19,5 @@ metric_list:
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 1.0
--- a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
@@ -12,3 +12,5 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 1.0
--- a/lm_eval/tasks/asdiv/default.yaml
+++ b/lm_eval/tasks/asdiv/default.yaml
@@ -10,3 +10,5 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 1.0
--- a/lm_eval/tasks/babi/babi.yaml
+++ b/lm_eval/tasks/babi/babi.yaml
@@ -16,3 +16,5 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
@@ -24,3 +24,5 @@ filter_list:
      - function: "regex"
        regex_pattern: "(?<=the answer is )(.*)(?=.)"
      - function: "take_first"
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
@@ -22,3 +22,5 @@ filter_list:
      - function: "regex"
        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
      - function: "take_first"
+metadata:
+  - version: 0
--- a/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
@@ -16,3 +16,5 @@ generation_kwargs:
    - "\n\n"
  do_sample: false
  temperature: 0.0
+metadata:
+  - version: 0
--- a/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
@@ -16,3 +16,5 @@ generation_kwargs:
    - "\n\n"
  do_sample: false
  temperature: 0.0
+metadata:
+  - version: 0
--- a/lm_eval/tasks/belebele/_default_template_yaml
+++ b/lm_eval/tasks/belebele/_default_template_yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/bigbench/generate_until_template_yaml
+++ b/lm_eval/tasks/bigbench/generate_until_template_yaml
@@ -14,3 +14,5 @@ metric_list:
    aggregation: mean
    higher_is_better: true
    ignore_punctuation: true
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/bigbench/multiple_choice_template_yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice_template_yaml
@@ -11,3 +11,5 @@ doc_to_choice: "{{multiple_choice_targets}}"
 metric_list:
  - metric: acc
  # TODO: brier score and other metrics
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/blimp/_template_yaml
+++ b/lm_eval/tasks/blimp/_template_yaml
@@ -10,3 +10,5 @@ should_decontaminate: true
 doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
 metric_list:
  - metric: acc
+metadata:
+  - version: 1.0
--- a/lm_eval/tasks/ceval/_default_ceval_yaml
+++ b/lm_eval/tasks/ceval/_default_ceval_yaml
@@ -16,4 +16,4 @@ metric_list:
    aggregation: mean
    higher_is_better: true
 metadata:
-  version: "1.0"
+  - version: 1.0
--- a/lm_eval/tasks/cmmlu/_default_template_yaml
+++ b/lm_eval/tasks/cmmlu/_default_template_yaml
@@ -15,3 +15,5 @@ metric_list:
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/code_x_glue/code-text/go.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/go.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/code_x_glue/code-text/java.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/java.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0