gaoqiong / lm-evaluation-harness · Commits

Commit 4c139701, authored Sep 15, 2023 by haileyschoelkopf

    Merge branch 'big-refactor' into bump-deps

Parents: 5794ec3c, cc547c7b

Showing 7 changed files with 60 additions and 19 deletions (+60, -19):
.github/workflows/new_tasks.yml      +2  -2
lm_eval/api/task.py                  +21 -14
lm_eval/tasks/nq_open/README.md      +0  -0
lm_eval/tasks/nq_open/nq_open.yaml   +30 -0
scripts/write_out.py                 +3  -1
tests/test_evaluator.py              +2  -1
tests/utils.py                       +2  -1
.github/workflows/new_tasks.yml (view file @ 4c139701)

```diff
@@ -63,10 +63,10 @@ jobs:
       - name: Test with pytest
         # if new tasks are added, run tests on them
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv
         # if api is modified, run tests on it
       - name: Test more tasks with pytest
         env:
           API: true
         if: steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv
```
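The only functional change here drops pytest-xdist's `-n=auto` flag from both test steps, so the task tests now run in a single pytest process instead of one worker per CPU. As a rough sketch of the resulting invocation (a hypothetical driver script, not part of the repo; `-n` is only valid when the pytest-xdist plugin is installed):

```python
# Hypothetical driver for the workflow's `run:` line, expressed in Python.
# `-n=auto` (removed by this commit) belongs to the pytest-xdist plugin;
# without it, pytest collects and runs tests/test_tasks.py serially.
import pytest

exit_code = pytest.main(["tests/test_tasks.py", "-s", "-vv"])
raise SystemExit(exit_code)
```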
lm_eval/api/task.py (view file @ 4c139701)

```diff
@@ -674,11 +674,11 @@ class ConfigurableTask(Task):
             check_choices = test_choice
         else:
             check_choices = [test_target]
         if self.config.doc_to_choice is not None:
             for choice in check_choices:
-                choice_has_whitespace = True if " " in choice else False
+                choice_has_whitespace = True if choice[0].isspace() else False
                 delimiter_has_whitespace = (
-                    True if " " in self.config.target_delimiter else False
+                    True if self.config.target_delimiter[-1].isspace() else False
                 )
                 if delimiter_has_whitespace and choice_has_whitespace:
```
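This hunk tightens the whitespace warning logic: instead of flagging any choice or delimiter that merely contains a space, it now flags only the combination that actually produces a doubled space at the join point, i.e. a `target_delimiter` ending in whitespace followed by a choice beginning with whitespace. A minimal standalone sketch of the new predicate (illustrative names, not the harness API):

```python
# A doubled space only appears when the delimiter *ends* with whitespace
# and the choice *starts* with whitespace, not when either merely
# contains a space somewhere in the middle.
def joins_with_double_space(delimiter: str, choice: str) -> bool:
    return delimiter[-1].isspace() and choice[0].isspace()

assert joins_with_double_space("A: ", " yes")          # " " + " " -> doubled space
assert not joins_with_double_space("A: ", "yes")       # clean join
assert not joins_with_double_space("A:", "yes or no")  # a space *inside* the choice no longer trips the check
```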
```diff
@@ -1067,6 +1067,9 @@ class ConfigurableTask(Task):
             # it assumes that doc_to_target returns a number.
             choices = self.doc_to_choice(doc)
             gold = choices[gold]
+        # we expect multiple_targets to be a list.
+        elif self.multiple_target:
+            gold = list(gold)
         else:
             gold = str(gold)
```
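The added `elif` gives multi-target golds their own normalization path, coercing whatever sequence `doc_to_target` returned into a real list before scoring. A minimal sketch of the three cases (assumed shapes, not the harness API):

```python
# Normalize a gold answer: an index into choices, a sequence of acceptable
# targets, or a single value to be compared as a string.
def normalize_gold(gold, choices=None, multiple_target=False):
    if choices is not None:
        return choices[gold]   # index -> the choice string it points at
    if multiple_target:
        return list(gold)      # any sequence -> a concrete list of targets
    return str(gold)           # single target -> plain string

assert normalize_gold(1, choices=["no", "yes"]) == "yes"
assert normalize_gold(("a", "b"), multiple_target=True) == ["a", "b"]
assert normalize_gold(42) == "42"
```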
```diff
@@ -1077,6 +1080,10 @@ class ConfigurableTask(Task):
             # return true if any are true
             # TODO: this may break for multipLe_target, non zero-or-1 metrics
             scores = []
+            if not isinstance(gold, list):
+                # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
+                # print(gold)
+                gold = [gold]
             for gold_option in gold:
                 try:
                     result_score = self._metric_fn_list[metric](
```
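The added guard tolerates multiple_target datasets in which an occasional document carries a single string answer rather than a list. Per the surrounding comment, the loop then scores every gold option and counts the document correct if any option matches; an illustrative reduction (not the exact harness logic):

```python
# Score a prediction against one or many gold options and keep the best,
# so for zero-or-one metrics a single matching option counts as correct.
def score_multi(pred: str, gold, metric_fn) -> float:
    if not isinstance(gold, list):  # tolerate a lone string answer
        gold = [gold]
    return max(metric_fn(pred, option) for option in gold)

def exact(pred: str, option: str) -> float:
    return float(pred == option)

assert score_multi("Paris", ["paris", "Paris"], exact) == 1.0
assert score_multi("Lyon", "Paris", exact) == 0.0
```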
lm_eval/tasks/nq_open/README.md (new file, 0 → 100644; view file @ 4c139701)

The README is added as an empty placeholder in this commit (+0, -0).
lm_eval/tasks/nq_open/nq_open.yaml (new file, 0 → 100644; view file @ 4c139701)

```yaml
task: nq_open
dataset_path: nq_open
output_type: greedy_until
training_split: train
validation_split: validation
description: "Answer these questions:\n"
doc_to_text: "Q: {{question}}?\nA:"
doc_to_target: "{{answer}}" # TODO: should be multi-target
fewshot_delimiter: "\n"
generation_kwargs:
  until:
    - "\n"
    - "."
    - ","
  do_sample: false
  temperature: 0.0
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
target_delimiter: " "
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      - "\ban|a|the\b"
```
scripts/write_out.py (view file @ 4c139701)

```diff
@@ -38,12 +38,14 @@ def main():
     iters = []
     for set in args.sets.split(","):
+        docs = None
         if set == "train" and task.has_training_docs():
             docs = task.training_docs()
         if set == "val" and task.has_validation_docs():
             docs = task.validation_docs()
         if set == "test" and task.has_test_docs():
             docs = task.test_docs()
-        iters.append(docs)
+        if docs is not None:
+            iters.append(docs)
     docs = join_iters(iters)
```
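The two added lines close a correctness hole: previously `docs` was never reset, so asking for a split the task lacks either re-appended the previous split's documents or hit a NameError on the first iteration. Initializing `docs = None` per split and guarding the append skips missing splits instead. An illustrative reproduction (`join_iters` assumed to chain iterables, as its name suggests):

```python
from itertools import chain

def join_iters(iters):
    # Assumed behavior of the harness helper: flatten a list of iterables.
    return chain.from_iterable(iters)

available = {"train": ["doc1", "doc2"]}  # a task with no "test" split
iters = []
for split in "train,test".split(","):
    docs = None                 # added by this commit: reset every iteration
    if split in available:
        docs = available[split]
    if docs is not None:        # added by this commit: skip missing splits
        iters.append(docs)

assert list(join_iters(iters)) == ["doc1", "doc2"]
```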
tests/test_evaluator.py (view file @ 4c139701)

```diff
@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks
 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
+from typing import List
 import random
 import pytest
@@ -26,7 +27,7 @@ import pytest
         )
     ],
 )
-def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
+def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
     task_name = task_name
     limit = 10
```
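Both edits are a Python-version compatibility fix: subscripting the builtin `list` in annotations (PEP 585) requires Python 3.9+, so the signature switches to `typing.List`, imported at the top of the file. A small sketch of the failure mode (hypothetical function name):

```python
from typing import List

def check_sig(task_name: List[str], limit: int) -> None:
    # On Python 3.8, writing `task_name: list[str]` here would raise
    # "TypeError: 'type' object is not subscriptable" at import time.
    assert isinstance(task_name, list) and isinstance(limit, int)

check_sig(["nq_open"], 10)
```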
tests/utils.py (view file @ 4c139701)

```diff
@@ -9,6 +9,7 @@ import os
 # This is the path where the output for the changed files for the tasks folder is stored
 # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+
 # reads a text file and returns a list of words
 # used to read the output of the changed txt from tj-actions/changed-files
 def load_changed_files(file_path: str) -> List[str]:
@@ -32,7 +33,7 @@ def parser(full_path: List[str]) -> List[str]:
     return list(_output)

-def new_tasks() -> Union[list[str], None]:
+def new_tasks() -> Union[List[str], None]:
     FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
     if os.path.exists(FILENAME):
         # If tasks folder has changed then we get the list of files from FILENAME
```
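The same lowercase-`list` to `typing.List` substitution is applied to `new_tasks`'s return annotation. For context, the comments describe `load_changed_files` as turning the tj-actions/changed-files output into a list of words; a minimal body consistent with that description (the real implementation may differ):

```python
from typing import List

def load_changed_files(file_path: str) -> List[str]:
    # Read the whitespace-separated list of changed file paths that
    # tj-actions/changed-files wrote to disk, one word per path.
    with open(file_path) as f:
        return f.read().split()

# e.g. load_changed_files(".github/outputs/tasks_all_changed_and_modified_files.txt")
```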