added lama primed and negated dataset

97e3c9fe · lintangsutawika · 7852985b · 97e3c9fe · 97e3c9fe · 97e3c9fe
Commit 97e3c9fe authored Apr 18, 2024 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/lama_primed_negated/README.md
+++ b/lm_eval/tasks/lama_primed_negated/README.md
+# LAMA Primed Negated
+### Paper
+Title: 
+Abstract: 
+Homepage: 
+### Citation
+```
+```
+### Groups and Tasks
+#### Groups
+#### Tasks
+### Checklist
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/_misprimed_correct_incorrect_template_yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/_misprimed_correct_incorrect_template_yaml
+# Shared settings for the misprimed Correct/Incorrect tasks. The per-dataset
+# task files reference this file through `include:` and add their own
+# task, dataset_name and test_split.
+group: lama_misprimed_correct_incorrect
+dataset_path: lintang/lama_primed_negated
+output_type: multiple_choice
+# description: "Answer the following question with either Correct or Incorrect. "
+# Prompt: the first misprimed sentence with [MASK] replaced by obj_label.
+doc_to_text: "Question: {{masked_misprimed[0]|replace('[MASK]', obj_label)}} Correct or Incorrect?\n\nAnswer:"
+# The gold answer is the same fixed choice for every document in this variant.
+doc_to_target: "Correct"
+doc_to_choice: ["Correct", "Incorrect"]
+should_decontaminate: true
+# Same template as doc_to_text.
+doc_to_decontamination_query: "Question: {{masked_misprimed[0]|replace('[MASK]', obj_label)}} Correct or Incorrect?\n\nAnswer:"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/_negated_correct_incorrect_template_yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/_negated_correct_incorrect_template_yaml
+# Shared settings for the negated Correct/Incorrect tasks. The per-dataset
+# task files reference this file through `include:` and add their own
+# task, dataset_name and test_split.
+group: lama_negated_correct_incorrect
+dataset_path: lintang/lama_primed_negated
+output_type: multiple_choice
+# description: "Answer the following question with either Correct or Incorrect. "
+# Prompt: the first negated sentence with [MASK] replaced by obj_label.
+doc_to_text: "Question: {{masked_negations[0]|replace('[MASK]', obj_label)}} Correct or Incorrect?\n\nAnswer:"
+# The gold answer is the same fixed choice for every document in this variant.
+doc_to_target: "Incorrect"
+doc_to_choice: ["Correct", "Incorrect"]
+should_decontaminate: true
+# Same template as doc_to_text.
+doc_to_decontamination_query: "Question: {{masked_negations[0]|replace('[MASK]', obj_label)}} Correct or Incorrect?\n\nAnswer:"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/_normal_correct_incorrect_template_yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/_normal_correct_incorrect_template_yaml
+# Shared settings for the normal (unmodified sentence) Correct/Incorrect tasks.
+# The per-dataset task files reference this file through `include:` and add
+# their own task, dataset_name and test_split.
+group: lama_normal_correct_incorrect
+dataset_path: lintang/lama_primed_negated
+output_type: multiple_choice
+# description: "Answer the following question with either Correct or Incorrect. "
+# Prompt: the first masked sentence with [MASK] replaced by obj_label.
+doc_to_text: "Question: {{masked_sentences[0]|replace('[MASK]', obj_label)}} Correct or Incorrect?\n\nAnswer:"
+# The gold answer is the same fixed choice for every document in this variant.
+doc_to_target: "Correct"
+doc_to_choice: ["Correct", "Incorrect"]
+should_decontaminate: true
+# Same template as doc_to_text.
+doc_to_decontamination_query: "Question: {{masked_sentences[0]|replace('[MASK]', obj_label)}} Correct or Incorrect?\n\nAnswer:"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/misprimed_conceptnet.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/misprimed_conceptnet.yaml
+# ConceptNet subset, misprimed sentences, Correct/Incorrect answer format.
+task: lama_misprimed_conceptnet_correct_incorrect
+dataset_name: ConceptNet
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _misprimed_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/misprimed_google_re.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/misprimed_google_re.yaml
+# GoogleRE subset, misprimed sentences, Correct/Incorrect answer format.
+task: lama_misprimed_google_re_correct_incorrect
+dataset_name: GoogleRE
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _misprimed_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/misprimed_squad.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/misprimed_squad.yaml
+# SQUAD subset, misprimed sentences, Correct/Incorrect answer format.
+task: lama_misprimed_squad_correct_incorrect
+dataset_name: SQUAD
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _misprimed_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/misprimed_trex.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/misprimed_trex.yaml
+# TREx subset, misprimed sentences, Correct/Incorrect answer format.
+task: lama_misprimed_trex_correct_incorrect
+dataset_name: TREx
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _misprimed_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/negated_conceptnet.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/negated_conceptnet.yaml
+# ConceptNet subset, negated sentences, Correct/Incorrect answer format.
+task: lama_negated_conceptnet_correct_incorrect
+dataset_name: ConceptNet
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _negated_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/negated_google_re.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/negated_google_re.yaml
+# GoogleRE subset, negated sentences, Correct/Incorrect answer format.
+task: lama_negated_google_re_correct_incorrect
+dataset_name: GoogleRE
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _negated_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/negated_squad.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/negated_squad.yaml
+# SQUAD subset, negated sentences, Correct/Incorrect answer format.
+task: lama_negated_squad_correct_incorrect
+dataset_name: SQUAD
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _negated_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/negated_trex.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/negated_trex.yaml
+# TREx subset, negated sentences, Correct/Incorrect answer format.
+task: lama_negated_trex_correct_incorrect
+dataset_name: TREx
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _negated_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/normal_conceptnet.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/normal_conceptnet.yaml
+# ConceptNet subset, unmodified sentences, Correct/Incorrect answer format.
+task: lama_normal_conceptnet_correct_incorrect
+dataset_name: ConceptNet
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _normal_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/normal_google_re.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/normal_google_re.yaml
+# GoogleRE subset, unmodified sentences, Correct/Incorrect answer format.
+task: lama_normal_google_re_correct_incorrect
+dataset_name: GoogleRE
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _normal_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/normal_squad.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/normal_squad.yaml
+# SQUAD subset, unmodified sentences, Correct/Incorrect answer format.
+task: lama_normal_squad_correct_incorrect
+dataset_name: SQUAD
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _normal_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/correct_incorrect/normal_trex.yaml
+++ b/lm_eval/tasks/lama_primed_negated/correct_incorrect/normal_trex.yaml
+# TREx subset, unmodified sentences, Correct/Incorrect answer format.
+task: lama_normal_trex_correct_incorrect
+dataset_name: TREx
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _normal_correct_incorrect_template_yaml
--- a/lm_eval/tasks/lama_primed_negated/right_wrong/_misprimed_right_wrong_template_yaml
+++ b/lm_eval/tasks/lama_primed_negated/right_wrong/_misprimed_right_wrong_template_yaml
+# Shared settings for the misprimed Right/Wrong tasks. The per-dataset task
+# files reference this file through `include:` and add their own
+# task, dataset_name and test_split.
+group: lama_misprimed_right_wrong
+dataset_path: lintang/lama_primed_negated
+output_type: multiple_choice
+# description: "Answer the following question with either Right or Wrong. "
+# Prompt: the first misprimed sentence with [MASK] replaced by obj_label.
+doc_to_text: "Question: {{masked_misprimed[0]|replace('[MASK]', obj_label)}} Right or Wrong?\n\nAnswer:"
+# The gold answer is the same fixed choice for every document in this variant.
+doc_to_target: "Right"
+doc_to_choice: ["Right", "Wrong"]
+should_decontaminate: true
+# Same template as doc_to_text.
+doc_to_decontamination_query: "Question: {{masked_misprimed[0]|replace('[MASK]', obj_label)}} Right or Wrong?\n\nAnswer:"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/lama_primed_negated/right_wrong/_negated_right_wrong_template_yaml
+++ b/lm_eval/tasks/lama_primed_negated/right_wrong/_negated_right_wrong_template_yaml
+# Shared settings for the negated Right/Wrong tasks. This chunk shows no
+# negated Right/Wrong task file; presumably per-dataset files reference this
+# one through `include:`, as the other variants do (confirm).
+group: lama_negated_right_wrong
+dataset_path: lintang/lama_primed_negated
+output_type: multiple_choice
+# description: "Answer the following question with either Right or Wrong. "
+# Prompt: the first negated sentence with [MASK] replaced by obj_label.
+doc_to_text: "Question: {{masked_negations[0]|replace('[MASK]', obj_label)}} Right or Wrong?\n\nAnswer:"
+# The gold answer is the same fixed choice for every document in this variant.
+doc_to_target: "Wrong"
+doc_to_choice: ["Right", "Wrong"]
+should_decontaminate: true
+# Same template as doc_to_text.
+doc_to_decontamination_query: "Question: {{masked_negations[0]|replace('[MASK]', obj_label)}} Right or Wrong?\n\nAnswer:"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/lama_primed_negated/right_wrong/_normal_right_wrong_template_yaml
+++ b/lm_eval/tasks/lama_primed_negated/right_wrong/_normal_right_wrong_template_yaml
+# Shared settings for the normal (unmodified sentence) Right/Wrong tasks.
+# This chunk shows no normal Right/Wrong task file; presumably per-dataset
+# files reference this one through `include:`, as the other variants do
+# (confirm).
+group: lama_normal_right_wrong
+dataset_path: lintang/lama_primed_negated
+output_type: multiple_choice
+# description: "Answer the following question with either Right or Wrong. "
+# Prompt: the first masked sentence with [MASK] replaced by obj_label.
+doc_to_text: "Question: {{masked_sentences[0]|replace('[MASK]', obj_label)}} Right or Wrong?\n\nAnswer:"
+# The gold answer is the same fixed choice for every document in this variant.
+doc_to_target: "Right"
+doc_to_choice: ["Right", "Wrong"]
+should_decontaminate: true
+# Same template as doc_to_text.
+doc_to_decontamination_query: "Question: {{masked_sentences[0]|replace('[MASK]', obj_label)}} Right or Wrong?\n\nAnswer:"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    # Boolean flag (not an answer label): larger accuracy is better.
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/lama_primed_negated/right_wrong/misprimed_conceptnet.yaml
+++ b/lm_eval/tasks/lama_primed_negated/right_wrong/misprimed_conceptnet.yaml
+# ConceptNet subset, misprimed sentences, Right/Wrong answer format.
+task: lama_misprimed_conceptnet_right_wrong
+dataset_name: ConceptNet
+test_split: high_ranked
+# Shared prompt/metric settings live in the template file.
+include: _misprimed_right_wrong_template_yaml