add versions

0d03a9f3 · lintangsutawika · bf26d979 · 0d03a9f3 · 0d03a9f3 · 0d03a9f3
Commit 0d03a9f3 authored Nov 28, 2023 by lintangsutawika
20 changed files
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -91,7 +91,7 @@ class TaskConfig(dict):
    should_decontaminate: bool = False
    doc_to_decontamination_query: str = None
-    metadata: str = None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
+    metadata: Union[str, list] = None # by default, not used in the code. allows for users to pass arbitrary info to tasks
    def __post_init__(self) -> None:
        if self.dataset_path and ("." in self.dataset_path):

--- a/lm_eval/tasks/anli/anli_r1.yaml
+++ b/lm_eval/tasks/anli/anli_r1.yaml
@@ -22,3 +22,5 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 1.0
\ No newline at end of file
--- a/lm_eval/tasks/arc/arc_easy.yaml
+++ b/lm_eval/tasks/arc/arc_easy.yaml
@@ -19,3 +19,5 @@ metric_list:
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 1.0
\ No newline at end of file
--- a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
@@ -12,3 +12,5 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 1.0
\ No newline at end of file
--- a/lm_eval/tasks/asdiv/default.yaml
+++ b/lm_eval/tasks/asdiv/default.yaml
@@ -10,3 +10,5 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 1.0
\ No newline at end of file
--- a/lm_eval/tasks/babi/babi.yaml
+++ b/lm_eval/tasks/babi/babi.yaml
@@ -16,3 +16,5 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 0.0
\ No newline at end of file
--- a/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
@@ -24,3 +24,5 @@ filter_list:
      - function: "regex"
        regex_pattern: "(?<=the answer is )(.*)(?=.)"
      - function: "take_first"
+metadata:
+  - version: 0.0
\ No newline at end of file
--- a/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
@@ -22,3 +22,5 @@ filter_list:
      - function: "regex"
        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
      - function: "take_first"
+metadata:
+  - version: 0
\ No newline at end of file
--- a/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
@@ -16,3 +16,5 @@ generation_kwargs:
    - "\n\n"
  do_sample: false
  temperature: 0.0
+metadata:
+  - version: 0
\ No newline at end of file
--- a/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
@@ -16,3 +16,5 @@ generation_kwargs:
    - "\n\n"
  do_sample: false
  temperature: 0.0
+metadata:
+  - version: 0
\ No newline at end of file
--- a/lm_eval/tasks/belebele/_default_template_yaml
+++ b/lm_eval/tasks/belebele/_default_template_yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 0.0
\ No newline at end of file
--- a/lm_eval/tasks/bigbench/generate_until_template_yaml
+++ b/lm_eval/tasks/bigbench/generate_until_template_yaml
@@ -14,3 +14,5 @@ metric_list:
    aggregation: mean
    higher_is_better: true
    ignore_punctuation: true
+metadata:
+  - version: 0.0
\ No newline at end of file
--- a/lm_eval/tasks/bigbench/multiple_choice_template_yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice_template_yaml
@@ -11,3 +11,5 @@ doc_to_choice: "{{multiple_choice_targets}}"
 metric_list:
  - metric: acc
  # TODO: brier score and other metrics
+metadata:
+  - version: 0.0
\ No newline at end of file
--- a/lm_eval/tasks/blimp/template_yaml
+++ b/lm_eval/tasks/blimp/template_yaml
@@ -9,3 +9,5 @@ should_decontaminate: true
 doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
 metric_list:
  - metric: acc
+metadata:
+  - version: 1.0
\ No newline at end of file
--- a/lm_eval/tasks/ceval/_default_ceval_yaml
+++ b/lm_eval/tasks/ceval/_default_ceval_yaml
@@ -16,4 +16,4 @@ metric_list:
    aggregation: mean
    higher_is_better: true
 metadata:
-  version: "1.0"
+  - version: 1.0
--- a/lm_eval/tasks/cmmlu/_default_template_yaml
+++ b/lm_eval/tasks/cmmlu/_default_template_yaml
@@ -15,3 +15,5 @@ metric_list:
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 0.0
\ No newline at end of file
--- a/lm_eval/tasks/code_x_glue/code-text/go.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/go.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0
\ No newline at end of file
--- a/lm_eval/tasks/code_x_glue/code-text/java.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/java.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0
\ No newline at end of file
--- a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0
\ No newline at end of file
--- a/lm_eval/tasks/code_x_glue/code-text/php.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/php.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0
\ No newline at end of file