"research/slim/deployment/model_deploy_test.py" did not exist on "fc7342bf047ec5fc7a707202adaf108661bd373d"
Commit f38f8e20 authored by haileyschoelkopf

Merge branch 'big-refactor' into seq2seq-support

parents a6c640d3 41677741
@@ -73,7 +73,7 @@ class TaskConfig(dict):
     repeats: int = 1
     metric_list: str = None
-    gold_alias: str = None
+    gold_alias: Union[Callable, str] = None
     output_type: str = "greedy_until"
     generation_kwargs: dict = None
     delimiter: str = "\n\n"
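The widened annotation above means a task config can supply gold_alias either as a Jinja-style template string or as a Python callable. A minimal sketch of the two forms, assuming a simplified stand-in config class (MiniTaskConfig and strip_final_answer are illustrative names, not harness code):

# Minimal sketch of the two gold_alias forms the Union[Callable, str] annotation
# allows; MiniTaskConfig and strip_final_answer are illustrative, not harness code.
from dataclasses import dataclass
from typing import Callable, Optional, Union


@dataclass
class MiniTaskConfig:
    gold_alias: Optional[Union[Callable, str]] = None


def strip_final_answer(doc: dict) -> str:
    # callable form: post-process the raw answer down to the reference string
    return doc["answer"].split("####")[-1].strip()


as_template = MiniTaskConfig(gold_alias="{{answer.split('#### ')[-1]}}")  # string form
as_callable = MiniTaskConfig(gold_alias=strip_final_answer)               # callable form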
@@ -95,7 +95,7 @@ class TaskConfig(dict):
             self.doc_to_target = self.template_aliases + self.doc_to_target
         if type(self.gold_alias) == str:
-            self.gold_alias = self.template_aliases + self.doc_to_target
+            self.gold_alias = self.template_aliases + self.gold_alias
         if self.generation_kwargs or self.output_type == "greedy_until":
             assert (
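The one-line fix above matters because, with the old code, any gold_alias template that differed from doc_to_target was silently discarded during post-init. A tiny before/after sketch with made-up template strings:

# Before/after sketch of the fixed concatenation; the templates are made up.
template_aliases = "{% set gold = answer.split('#### ')[-1] %}"
doc_to_target = "{{answer}}"
gold_alias = "{{gold}}"

buggy = template_aliases + doc_to_target  # old behaviour: the alias template was lost
fixed = template_aliases + gold_alias     # new behaviour: the alias template is kept
assert buggy != fixed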
@@ -737,10 +737,11 @@ class ConfigurableTask(Task):
     def gold_alias(self, doc):
         # TODO: reevaluate if we need this. implemented to have a
         # processed version of answer to put into gsm8k exact_match scoring as ref.
-        if self._config.gold_alias:
+        if self._config.gold_alias is not None:
             doc_to_target = self._config.gold_alias
         else:
-            doc_to_target = self._config.doc_to_target
+            # doc_to_target = self._config.doc_to_target
+            return self.doc_to_target(doc)

         if type(doc_to_target) == str:
             return utils.apply_template(doc_to_target, doc)
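The TODO above names the gsm8k exact_match use case: roughly, doc_to_target keeps the full rationale, while a gold_alias template can strip it down to the bare answer used as the reference. A hedged illustration with jinja2 and an invented gsm8k-style row (the template shown is an example, not the shipped gsm8k config):

# Illustration only: invented doc and templates in the gsm8k style, rendered
# with jinja2 directly as a stand-in for utils.apply_template.
from jinja2 import Template

doc = {"answer": "She sold 48 in April and 48/2 = 24 in May, so 48 + 24 = 72.\n#### 72"}
doc_to_target = "{{answer}}"
gold_alias = "{{answer.split('#### ')[-1]}}"

print(Template(doc_to_target).render(**doc))  # full rationale, ends with "#### 72"
print(Template(gold_alias).render(**doc))     # "72" -- the exact_match reference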
@@ -842,7 +843,11 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "multiple_choice":
             lls, is_greedy = zip(*results)
-            gold = int(self.doc_to_target(doc))
+            if self._config.gold_alias is not None:
+                gold = int(self.gold_alias(doc))
+            else:
+                gold = int(self.doc_to_target(doc))
             pred = np.argmax(lls)
             # retrieve choices in List[str] form, to compute choice lengths, etc.
             choices = ast.literal_eval(
......
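For the multiple_choice branch above, a self-contained sketch of how gold and pred combine into accuracy; the shapes and values are assumed, not taken from the harness:

# Assumed-shape sketch of the multiple_choice scoring step: one
# (loglikelihood, is_greedy) pair per answer choice, gold = index of the
# correct choice, pred = argmax over the loglikelihoods.
import numpy as np

results = [(-4.1, False), (-2.3, False), (-0.7, True), (-3.9, False)]
lls, is_greedy = zip(*results)

gold = 2                   # from int(self.gold_alias(doc)) or int(self.doc_to_target(doc))
pred = np.argmax(lls)      # index of the most likely choice under the model
acc = 1.0 if pred == gold else 0.0

print(pred, acc)           # 2 1.0
# acc_norm (also listed in the YAMLs below) would instead take the argmax of
# loglikelihoods normalised by each choice's length.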
@@ -23,7 +23,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [ ] LogiQA
 - [ ] HellaSwag
 - [ ] SWAG
-- [ ] OpenBookQA
+- [x] OpenBookQA
 - [ ] SQuADv2
 - [ ] RACE
 - [ ] HeadQA
......
group:
- multiple_choice
task: openbookqa
dataset_path: openbookqa
dataset_name: main
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey.lstrip()) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
doc_to_text: "{{question_stem}}"
doc_to_target: "{{gold}}" # this will be cast to an int.
should_decontaminate: true
doc_to_decontamination_query: "{{question_stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
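To make the new OpenBookQA YAML concrete, here is a hedged rendering of its templates against one invented row using the dataset's usual columns (question_stem, choices, answerKey); the letter answerKey is mapped to a choice index by the {% set gold = ... %} alias:

# Invented OpenBookQA-style row, rendered with jinja2 to show what
# doc_to_text and the alias-driven doc_to_target evaluate to.
from jinja2 import Environment

doc = {
    "question_stem": "Which object would let the most heat travel through?",
    "choices": {
        "text": ["a wool blanket", "a steel spoon", "a cotton shirt", "a plastic fork"],
        "label": ["A", "B", "C", "D"],
    },
    "answerKey": "B",
}

env = Environment()
aliases = (
    "{% set answer_choices = choices['text'] %}"
    "{% set gold = choices.label.index(answerKey.lstrip()) %}"
)

print(env.from_string("{{question_stem}}").render(**doc))   # the prompt text
print(env.from_string(aliases + "{{gold}}").render(**doc))  # "1" -> cast to int downstream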
@@ -9,7 +9,8 @@ validation_split: validation
 test_split: test
 template_aliases: "{% set answer_choices = [distractor1, distractor2, distractor3, correct_answer] %}{% set gold = 3 %}" # set the list of possible answer choices, and set what this doc's gold label idx is
 doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:"
-doc_to_target: "{{gold}}" # this will be cast to an int.
+doc_to_target: " {{correct_answer}}"
+gold_alias: "{{gold}}" # this will be cast to an int.
 metric_list:
   - metric: acc
     aggregation: mean
......
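The sciq change above splits the two roles: doc_to_target now renders the answer text (a completion string), while the new gold_alias keeps the integer index that multiple_choice scoring casts with int(...). A hedged rendering against an invented sciq-style row:

# Invented sciq-style row (question, distractor1-3, correct_answer, support),
# rendered with jinja2 to contrast the new doc_to_target with gold_alias.
from jinja2 import Environment

doc = {
    "question": "What do unstable isotopes emit as they decay?",
    "support": "",
    "distractor1": "electrons only",
    "distractor2": "protons only",
    "distractor3": "neutrons only",
    "correct_answer": "radiation",
}

env = Environment()
aliases = (
    "{% set answer_choices = [distractor1, distractor2, distractor3, correct_answer] %}"
    "{% set gold = 3 %}"
)

print(env.from_string(" {{correct_answer}}").render(**doc))      # " radiation"
print(int(env.from_string(aliases + "{{gold}}").render(**doc)))  # 3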
@@ -13,6 +13,12 @@ setuptools.setup(
     long_description_content_type="text/markdown",
     url="https://github.com/EleutherAI/lm-evaluation-harness",
     packages=setuptools.find_packages(),
+    # required to include yaml files in pip installation
+    package_data={
+        "lm_eval": ["**/*.yaml"],
+        "examples": ["**/*.yaml"],
+    },
+    include_package_data=True,
     classifiers=[
         "Development Status :: 3 - Alpha",
         "Programming Language :: Python :: 3",
......
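Whether the recursive ** patterns in package_data expand as intended can depend on the setuptools version, so a cheap post-install check is useful. A sanity-check script (not part of the repo) that globs the installed lm_eval package for the shipped YAML task configs:

# Post-install sanity check (not part of the repo): confirm the YAML task
# configs listed in package_data were actually copied into site-packages.
import pathlib

import lm_eval

pkg_root = pathlib.Path(lm_eval.__file__).parent
yaml_files = sorted(pkg_root.glob("**/*.yaml"))

print(f"{len(yaml_files)} YAML files installed under {pkg_root}")
for path in yaml_files[:5]:
    print(" ", path.relative_to(pkg_root))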