merged latest and added altworld files

835cc40e · lintangsutawika · 8da401e0 · c9bbec6e · 835cc40e · 835cc40e
Commit 835cc40e authored Dec 06, 2023 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/blimp/tough_vs_raising_2.yaml
+++ b/lm_eval/tasks/blimp/tough_vs_raising_2.yaml
 # Generated by utils.py
 dataset_name: tough_vs_raising_2
-include: template_yaml
+include: _template_yaml
 task: blimp_tough_vs_raising_2
--- a/lm_eval/tasks/blimp/transitive.yaml
+++ b/lm_eval/tasks/blimp/transitive.yaml
 # Generated by utils.py
 dataset_name: transitive
-include: template_yaml
+include: _template_yaml
 task: blimp_transitive
--- a/lm_eval/tasks/blimp/wh_island.yaml
+++ b/lm_eval/tasks/blimp/wh_island.yaml
 # Generated by utils.py
 dataset_name: wh_island
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_island
--- a/lm_eval/tasks/blimp/wh_questions_object_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_questions_object_gap.yaml
 # Generated by utils.py
 dataset_name: wh_questions_object_gap
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_questions_object_gap
--- a/lm_eval/tasks/blimp/wh_questions_subject_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_questions_subject_gap.yaml
 # Generated by utils.py
 dataset_name: wh_questions_subject_gap
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_questions_subject_gap
--- a/lm_eval/tasks/blimp/wh_questions_subject_gap_long_distance.yaml
+++ b/lm_eval/tasks/blimp/wh_questions_subject_gap_long_distance.yaml
 # Generated by utils.py
 dataset_name: wh_questions_subject_gap_long_distance
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_questions_subject_gap_long_distance
--- a/lm_eval/tasks/blimp/wh_vs_that_no_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_vs_that_no_gap.yaml
 # Generated by utils.py
 dataset_name: wh_vs_that_no_gap
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_vs_that_no_gap
--- a/lm_eval/tasks/blimp/wh_vs_that_no_gap_long_distance.yaml
+++ b/lm_eval/tasks/blimp/wh_vs_that_no_gap_long_distance.yaml
 # Generated by utils.py
 dataset_name: wh_vs_that_no_gap_long_distance
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_vs_that_no_gap_long_distance
--- a/lm_eval/tasks/blimp/wh_vs_that_with_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_vs_that_with_gap.yaml
 # Generated by utils.py
 dataset_name: wh_vs_that_with_gap
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_vs_that_with_gap
--- a/lm_eval/tasks/blimp/wh_vs_that_with_gap_long_distance.yaml
+++ b/lm_eval/tasks/blimp/wh_vs_that_with_gap_long_distance.yaml
 # Generated by utils.py
 dataset_name: wh_vs_that_with_gap_long_distance
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_vs_that_with_gap_long_distance
--- a/lm_eval/tasks/ceval/_default_ceval_yaml
+++ b/lm_eval/tasks/ceval/_default_ceval_yaml
@@ -16,4 +16,4 @@ metric_list:
    aggregation: mean
    higher_is_better: true
 metadata:
-  version: "1.0"
+  - version: 1.0
--- a/lm_eval/tasks/cmmlu/_default_template_yaml
+++ b/lm_eval/tasks/cmmlu/_default_template_yaml
@@ -15,3 +15,5 @@ metric_list:
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/code_x_glue/code-text/go.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/go.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/code_x_glue/code-text/java.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/java.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/code_x_glue/code-text/php.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/php.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/code_x_glue/code-text/python.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/python.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 0.0
--- a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml
@@ -17,3 +17,5 @@ metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
+metadata:
+  - version: 2.0
--- a/lm_eval/tasks/coqa/default.yaml
+++ b/lm_eval/tasks/coqa/default.yaml
@@ -18,3 +18,5 @@ metric_list:
  - metric: f1
    aggregation: mean
    higher_is_better: true
+metadata:
+  - version: 2.0
--- a/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml
+++ b/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml
@@ -19,3 +19,5 @@ metric_list:
  - metric: pct_stereotype
    aggregation: mean
    higher_is_better: false
+metadata:
+  - version: 1.0