Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into superglue

33f2f9bf · lintangsutawika · e1fdf2a8 · 7634a6ec · 33f2f9bf · 33f2f9bf
Commit 33f2f9bf authored Aug 10, 2023 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/blimp/principle_A_domain_3.yaml
+++ b/lm_eval/tasks/blimp/principle_A_domain_3.yaml
+# Generated by utils.py
+dataset_name: principle_A_domain_3
+include: template_yaml
+task: blimp_principle_A_domain_3
--- a/lm_eval/tasks/blimp/principle_A_reconstruction.yaml
+++ b/lm_eval/tasks/blimp/principle_A_reconstruction.yaml
+# Generated by utils.py
+dataset_name: principle_A_reconstruction
+include: template_yaml
+task: blimp_principle_A_reconstruction
--- a/lm_eval/tasks/blimp/regular_plural_subject_verb_agreement_1.yaml
+++ b/lm_eval/tasks/blimp/regular_plural_subject_verb_agreement_1.yaml
+# Generated by utils.py
+dataset_name: regular_plural_subject_verb_agreement_1
+include: template_yaml
+task: blimp_regular_plural_subject_verb_agreement_1
--- a/lm_eval/tasks/blimp/regular_plural_subject_verb_agreement_2.yaml
+++ b/lm_eval/tasks/blimp/regular_plural_subject_verb_agreement_2.yaml
+# Generated by utils.py
+dataset_name: regular_plural_subject_verb_agreement_2
+include: template_yaml
+task: blimp_regular_plural_subject_verb_agreement_2
--- a/lm_eval/tasks/blimp/sentential_negation_npi_licensor_present.yaml
+++ b/lm_eval/tasks/blimp/sentential_negation_npi_licensor_present.yaml
+# Generated by utils.py
+dataset_name: sentential_negation_npi_licensor_present
+include: template_yaml
+task: blimp_sentential_negation_npi_licensor_present
--- a/lm_eval/tasks/blimp/sentential_negation_npi_scope.yaml
+++ b/lm_eval/tasks/blimp/sentential_negation_npi_scope.yaml
+# Generated by utils.py
+dataset_name: sentential_negation_npi_scope
+include: template_yaml
+task: blimp_sentential_negation_npi_scope
--- a/lm_eval/tasks/blimp/sentential_subject_island.yaml
+++ b/lm_eval/tasks/blimp/sentential_subject_island.yaml
+# Generated by utils.py
+dataset_name: sentential_subject_island
+include: template_yaml
+task: blimp_sentential_subject_island
--- a/lm_eval/tasks/blimp/superlative_quantifiers_1.yaml
+++ b/lm_eval/tasks/blimp/superlative_quantifiers_1.yaml
+# Generated by utils.py
+dataset_name: superlative_quantifiers_1
+include: template_yaml
+task: blimp_superlative_quantifiers_1
--- a/lm_eval/tasks/blimp/superlative_quantifiers_2.yaml
+++ b/lm_eval/tasks/blimp/superlative_quantifiers_2.yaml
+# Generated by utils.py
+dataset_name: superlative_quantifiers_2
+include: template_yaml
+task: blimp_superlative_quantifiers_2
--- a/lm_eval/tasks/blimp/template_yaml
+++ b/lm_eval/tasks/blimp/template_yaml
+# Shared settings for every BLiMP task; the per-task YAML files pull this in
+# via `include: template_yaml` and only add `dataset_name` and `task`.
+group: blimp
+dataset_path: blimp
+output_type: multiple_choice
+# The Hugging Face `blimp` dataset publishes only a `train` split, so that
+# split is used as the evaluation set.
+validation_split: train
+# No prompt: the two candidate sentences are scored on their own.
+doc_to_text: ""
+# Index 0 of `doc_to_choice` (sentence_good) is the correct answer.
+doc_to_target: 0
+doc_to_choice: "{{[sentence_good, sentence_bad]}}"
+should_decontaminate: true
+doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
+metric_list:
+  - metric: acc
--- a/lm_eval/tasks/blimp/tough_vs_raising_1.yaml
+++ b/lm_eval/tasks/blimp/tough_vs_raising_1.yaml
+# Generated by utils.py
+dataset_name: tough_vs_raising_1
+include: template_yaml
+task: blimp_tough_vs_raising_1
--- a/lm_eval/tasks/blimp/tough_vs_raising_2.yaml
+++ b/lm_eval/tasks/blimp/tough_vs_raising_2.yaml
+# Generated by utils.py
+dataset_name: tough_vs_raising_2
+include: template_yaml
+task: blimp_tough_vs_raising_2
--- a/lm_eval/tasks/blimp/transitive.yaml
+++ b/lm_eval/tasks/blimp/transitive.yaml
+# Generated by utils.py
+dataset_name: transitive
+include: template_yaml
+task: blimp_transitive
--- a/lm_eval/tasks/blimp/wh_island.yaml
+++ b/lm_eval/tasks/blimp/wh_island.yaml
+# Generated by utils.py
+dataset_name: wh_island
+include: template_yaml
+task: blimp_wh_island
--- a/lm_eval/tasks/blimp/wh_questions_object_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_questions_object_gap.yaml
+# Generated by utils.py
+dataset_name: wh_questions_object_gap
+include: template_yaml
+task: blimp_wh_questions_object_gap
--- a/lm_eval/tasks/blimp/wh_questions_subject_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_questions_subject_gap.yaml
+# Generated by utils.py
+dataset_name: wh_questions_subject_gap
+include: template_yaml
+task: blimp_wh_questions_subject_gap
--- a/lm_eval/tasks/blimp/wh_questions_subject_gap_long_distance.yaml
+++ b/lm_eval/tasks/blimp/wh_questions_subject_gap_long_distance.yaml
+# Generated by utils.py
+dataset_name: wh_questions_subject_gap_long_distance
+include: template_yaml
+task: blimp_wh_questions_subject_gap_long_distance
--- a/lm_eval/tasks/blimp/wh_vs_that_no_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_vs_that_no_gap.yaml
+# Generated by utils.py
+dataset_name: wh_vs_that_no_gap
+include: template_yaml
+task: blimp_wh_vs_that_no_gap
--- a/lm_eval/tasks/blimp/wh_vs_that_no_gap_long_distance.yaml
+++ b/lm_eval/tasks/blimp/wh_vs_that_no_gap_long_distance.yaml
+# Generated by utils.py
+dataset_name: wh_vs_that_no_gap_long_distance
+include: template_yaml
+task: blimp_wh_vs_that_no_gap_long_distance
--- a/lm_eval/tasks/blimp/wh_vs_that_with_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_vs_that_with_gap.yaml
+# Generated by utils.py
+dataset_name: wh_vs_that_with_gap
+include: template_yaml
+task: blimp_wh_vs_that_with_gap