"src/vscode:/vscode.git/clone" did not exist on "cc0fa73acd0a98169e15cf3da6849064b92fbef1"
Unverified commit e0417b3b authored by Hailey Schoelkopf, committed by GitHub

Merge branch 'big-refactor' into unscramble+toxigen

parents 907c99c0 7d4e92fa
@@ -101,13 +101,29 @@ class TaskConfig(dict):
         if type(self.gold_alias) == str:
             self.gold_alias = self.template_aliases + self.gold_alias
 
-        if self.generation_kwargs:
-            assert (
-                self.output_type == "greedy_until"
-            ), "passed `generation_kwargs`, but not using a generation request type!"
-        elif self.output_type == "greedy_until":
-            # ensure that we greedily generate in absence of explicit arguments otherwise
-            self.generation_kwargs = {"do_sample": False, "temperature": 0.0}
+        if self.generation_kwargs is not None:
+            if self.output_type != "greedy_until":
+                eval_logger.warning(
+                    "passed `generation_kwargs`, but not using a generation request type!"
+                )
+
+            if "temperature" in self.generation_kwargs:
+                self.generation_kwargs["temperature"] = float(
+                    self.generation_kwargs["temperature"]
+                )
+
+            if "until" not in self.generation_kwargs:
+                self.generation_kwargs["until"] = [self.fewshot_delimiter]
+        else:
+            if self.output_type == "greedy_until":
+                # ensure that we greedily generate in absence of explicit arguments otherwise
+                self.generation_kwargs = {
+                    "until": None
+                    if self.fewshot_delimiter is None
+                    else [self.fewshot_delimiter],
+                    "do_sample": False,
+                    "temperature": 0.0,
+                }
 
         # TODO: how to make TaskConfigs be de- and re-serializable, even when using the !function constructor?
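For context on the new behaviour, here is a minimal standalone sketch of the default-filling logic above. The helper name, the free-standing arguments, and the `"\n\n"` delimiter default are illustrative only; in the harness this runs inside `TaskConfig`:

```
# Illustrative restatement of the hunk above, outside of TaskConfig.
# `fill_generation_kwargs` is a hypothetical helper, not a harness API.
def fill_generation_kwargs(generation_kwargs, output_type, fewshot_delimiter="\n\n"):
    if generation_kwargs is not None:
        if output_type != "greedy_until":
            print("warning: passed `generation_kwargs`, but not using a generation request type!")
        if "temperature" in generation_kwargs:
            # temperatures may arrive as strings from YAML; normalize to float
            generation_kwargs["temperature"] = float(generation_kwargs["temperature"])
        if "until" not in generation_kwargs:
            # default stop sequence: the few-shot delimiter
            generation_kwargs["until"] = [fewshot_delimiter]
        return generation_kwargs
    if output_type == "greedy_until":
        # greedy defaults when no explicit arguments were given
        return {
            "until": None if fewshot_delimiter is None else [fewshot_delimiter],
            "do_sample": False,
            "temperature": 0.0,
        }
    return None

print(fill_generation_kwargs(None, "greedy_until"))
# -> {'until': ['\n\n'], 'do_sample': False, 'temperature': 0.0}
print(fill_generation_kwargs({"temperature": "0.7"}, "greedy_until"))
# -> {'temperature': 0.7, 'until': ['\n\n']}
```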
@@ -12,28 +12,27 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] ~~Lambada (Multilingual)~~
 - [x] Wikitext
 - [x] PiQA
-- [ ] PROST (WIP)
+- [x] PROST
 - [ ] MCTACO
 - [x] Pubmed QA
 - [x] SciQ
 - [ ] QASPER
-- [ ] QA4MRE (WIP)
+- [x] QA4MRE
 - [ ] TriviaQA
 - [x] AI2 ARC
-- [ ] LogiQA
+- [ ] LogiQA (WIP)
 - [x] HellaSwag
 - [x] SWAG
 - [x] OpenBookQA
-- [ ] SQuADv2
 - [x] RACE
 - [ ] LogiQA (WIP)
 - [x] HellaSwag
-- [ ] SWAG (WIP)
+- [x] SWAG
 - [x] OpenBookQA
 - [ ] SQuADv2 (WIP)
-- [ ] RACE (WIP)
-- [ ] HeadQA (WIP)
-- [ ] MathQA
+- [x] RACE
+- [x] HeadQA (WIP)
+- [ ] MathQA (WIP)
 - [ ] WebQs
 - [ ] WSC273
 - [x] Winogrande
@@ -42,7 +41,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [ ] TruthfulQA
 - [ ] MuTual
 - [ ] Hendrycks Math (WIP)
-- [ ] Asdiv
+- [ ] Asdiv (WIP)
 - [ ] GSM8k
 - [x] Arithmetic
 - [ ] MMMLU
@@ -19,6 +19,3 @@ metric_list:
   - metric: acc_norm
     aggregation: mean
     higher_is_better: true
-  - metric: acc_mutual_info
-    aggregation: mean
-    higher_is_better: true
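For readers wondering what was dropped here: `acc_mutual_info` is the mutual-information variant of multiple-choice accuracy. A rough, simplified restatement of how the three accuracy variants differ, under my reading of the harness's metrics (this is not the harness's code and the numbers are invented):

```
# Simplified sketch of the three multiple-choice accuracy variants (illustrative only).
import numpy as np

lls = np.array([-12.3, -10.1, -11.8])                # log-likelihood of each choice given the prompt
lls_unconditional = np.array([-9.0, -10.5, -9.5])    # log-likelihood of each choice alone (for mutual info)
completion_lens = np.array([6, 9, 7])                # byte length of each choice (for length normalization)
gold = 1

acc = int(np.argmax(lls) == gold)                                   # raw log-likelihood
acc_norm = int(np.argmax(lls / completion_lens) == gold)            # length-normalized
acc_mutual_info = int(np.argmax(lls - lls_unconditional) == gold)   # p(answer|prompt) vs p(answer)
print(acc, acc_norm, acc_mutual_info)
```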
# HEAD-QA
### Paper
HEAD-QA: A Healthcare Dataset for Complex Reasoning
https://arxiv.org/pdf/1906.04701.pdf
HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the
Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio
de Sanidad, Consumo y Bienestar Social.
The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.
Homepage: https://aghie.github.io/head-qa/
### Citation
```
@inproceedings{vilares-gomez-rodriguez-2019-head,
title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning",
author = "Vilares, David and
G{\'o}mez-Rodr{\'i}guez, Carlos",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1092",
doi = "10.18653/v1/P19-1092",
pages = "960--966",
abstract = "We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.",
}
```
### Subtasks
* `headqa_en` - English variant of HEAD-QA
* `headqa_es` - Spanish variant of HEAD-QA
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?\
* [x] Same as LM Evaluation Harness v0.3.0 implementation
group:
- multiple_choice
task: headqa_en
dataset_path: EleutherAI/headqa
dataset_name: en
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
template_aliases: "{% set answer_choices = answers|map(attribute='atext')|list %}{% set gold = ra - 1 %}" # set the list of possible answer choices, and set what this doc's gold label idx is
doc_to_text: "Question: {{qtext}}\nAnswer:"
doc_to_target: "{{answer_choices[gold]}}"
gold_alias: "{{gold}}" # this will be cast to an int.
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
include: headqa_en.yaml
task: headqa_es
dataset_name: es
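To see how `template_aliases`, `doc_to_text`, `doc_to_target`, and `gold_alias` combine, here is a small sketch using plain jinja2. The example document is made up; only the field names (`qtext`, `ra`, `answers`/`atext`) follow the EleutherAI/headqa schema referenced above:

```
# Render the headqa_en templates against a toy document (not a real dataset row).
from jinja2 import Template

doc = {
    "qtext": "Which organ secretes insulin?",
    "ra": 2,  # 1-indexed id of the right answer
    "answers": [{"aid": 1, "atext": "Liver"}, {"aid": 2, "atext": "Pancreas"}],
}

aliases = (
    "{% set answer_choices = answers|map(attribute='atext')|list %}"
    "{% set gold = ra - 1 %}"
)
doc_to_text = "Question: {{qtext}}\nAnswer:"
doc_to_target = "{{answer_choices[gold]}}"
gold_alias = "{{gold}}"

print(Template(aliases + doc_to_text).render(**doc))    # Question: Which organ secretes insulin?\nAnswer:
print(Template(aliases + doc_to_target).render(**doc))  # Pancreas
print(Template(aliases + gold_alias).render(**doc))     # 1  (cast to an int by the harness)
```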
@@ -18,6 +18,3 @@ metric_list:
   - metric: acc_norm
     aggregation: mean
     higher_is_better: true
-  - metric: acc_mutual_info
-    aggregation: mean
-    higher_is_better: true
def qa4mre_process(doc):
    # QA4MRE answer ids are 1-indexed; shift to a 0-indexed gold label.
    return int(doc["correct_answer_id"]) - 1


def doc_to_target(doc):
    # Return the gold answer string from the list of answer options.
    return doc["answer_options"]["answer_str"][qa4mre_process(doc)]
group:
- multiple_choice
task: qa4mre_2011
dataset_path: qa4mre
dataset_name: 2011.main.EN
output_type: multiple_choice
test_split: train
template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
doc_to_target: !function preprocess_qa4mre.doc_to_target
gold_alias: !function preprocess_qa4mre.qa4mre_process
should_decontaminate: true
doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
group:
- multiple_choice
task: qa4mre_2012
dataset_path: qa4mre
dataset_name: 2012.main.EN
output_type: multiple_choice
test_split: train
template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
doc_to_target: !function preprocess_qa4mre.doc_to_target
gold_alias: !function preprocess_qa4mre.qa4mre_process
should_decontaminate: true
doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
group:
- multiple_choice
task: qa4mre_2013
dataset_path: qa4mre
dataset_name: 2013.main.EN
output_type: multiple_choice
test_split: train
template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
doc_to_target: !function preprocess_qa4mre.doc_to_target
gold_alias: !function preprocess_qa4mre.qa4mre_process
should_decontaminate: true
doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
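Since the three QA4MRE configs differ only in `task` and `dataset_name`, the later years could in principle reuse the 2011 file via `include`, mirroring how `headqa_es.yaml` includes `headqa_en.yaml` above. A hypothetical sketch, not part of this commit:

```
# hypothetical qa4mre_2012.yaml reusing the 2011 config
include: qa4mre_2011.yaml
task: qa4mre_2012
dataset_name: 2012.main.EN
```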
 group:
   - multiple_choice
 task: race
-dataset_path: bfattori/race
+dataset_path: EleutherAI/race
 dataset_name: high
 output_type: multiple_choice
 test_split: test
@@ -11,4 +11,4 @@ doc_to_target: !function preprocess_race.doc_to_target
 metric_list:
   - metric: acc
     aggregation: mean
-    higher_is_better: true
\ No newline at end of file
+    higher_is_better: true
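A quick way to confirm the new `dataset_path` resolves with the same config and split used above (assumes the `datasets` library is installed; the printed column names depend on how the EleutherAI/race mirror is structured):

```
# Load the split that race.yaml points at after this change.
from datasets import load_dataset

race_high_test = load_dataset("EleutherAI/race", "high", split="test")
print(len(race_high_test))
print(race_high_test.column_names)  # inspect the available fields
```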
@@ -18,6 +18,3 @@ metric_list:
   - metric: acc_norm
     aggregation: mean
     higher_is_better: true
-  - metric: acc_mutual_info
-    aggregation: mean
-    higher_is_better: true