Merge pull request #785 from EleutherAI/squadv2

[Refactor] squadv2

Merge pull request #785 from EleutherAI/squadv2
[Refactor] squadv2
54a53d6f · Lintang Sutawika · GitHub · 8043cb19 · d03974ad · 8043cb19
Unverified Commit 54a53d6f authored Sep 19, 2023 by Lintang Sutawika Committed by GitHub Sep 19, 2023
11 changed files
--- a/lm_eval/benchmarks/__init__.py
+++ b/lm_eval/benchmarks/__init__.py
-import os
-import yaml
-
-from lm_eval import utils
-from lm_eval.tasks import register_configurable_task, check_prompt_config
-from lm_eval.logger import eval_logger
-from lm_eval.api.registry import (
-    TASK_REGISTRY,
-    GROUP_REGISTRY,
-    ALL_TASKS,
-)
-
-
-def include_benchmarks(task_dir: str) -> None:
-    for root, subdirs, file_list in os.walk(task_dir):
-        if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
-            for f in file_list:
-                if f.endswith(".yaml"):
-                    try:
-                        benchmark_path = os.path.join(root, f)
-
-                        with open(benchmark_path, "rb") as file:
-                            yaml_config = yaml.full_load(file)
-
-                        assert "group" in yaml_config
-                        group = yaml_config["group"]
-                        all_task_list = yaml_config["task"]
-                        config_list = [
-                            task for task in all_task_list if type(task) != str
-                        ]
-                        task_list = [
-                            task for task in all_task_list if type(task) == str
-                        ]
-
-                        for task_config in config_list:
-                            var_configs = check_prompt_config(
-                                {
-                                    **task_config,
-                                    **{"group": group},
-                                }
-                            )
-                            for config in var_configs:
-                                register_configurable_task(config)
-
-                        task_names = utils.pattern_match(task_list, ALL_TASKS)
-                        for task in task_names:
-                            if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
-                                if group in GROUP_REGISTRY:
-                                    GROUP_REGISTRY[group].append(task)
-                                else:
-                                    GROUP_REGISTRY[group] = [task]
-                                    ALL_TASKS.add(group)
-                    except Exception as error:
-                        eval_logger.warning(
-                            "Failed to load benchmark in\n"
-                            f"                                 {benchmark_path}\n"
-                            "                                 Benchmark will not be added to registry\n"
-                            f"                                 Error: {error}"
-                        )
-
-
-task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
-include_benchmarks(task_dir)
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -11,7 +11,6 @@ import numpy as np

 import lm_eval.api
 import lm_eval.tasks
-import lm_eval.benchmarks
 import lm_eval.models
 import lm_eval.api.metrics
 import lm_eval.api.registry

--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -38,6 +38,34 @@ def register_configurable_task(config: Dict[str, str]) -> int:
    return 0


+def register_configurable_group(config: Dict[str, str]) -> int:
+    group = config["group"]
+    all_task_list = config["task"]
+    config_list = [task for task in all_task_list if type(task) != str]
+    task_list = [task for task in all_task_list if type(task) == str]
+
+    for task_config in config_list:
+        var_configs = check_prompt_config(
+            {
+                **task_config,
+                **{"group": group},
+            }
+        )
+        for config in var_configs:
+            register_configurable_task(config)
+
+    task_names = utils.pattern_match(task_list, ALL_TASKS)
+    for task in task_names:
+        if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
+            if group in GROUP_REGISTRY:
+                GROUP_REGISTRY[group].append(task)
+            else:
+                GROUP_REGISTRY[group] = [task]
+                ALL_TASKS.add(group)
+
+    return 0
+
+
 def check_prompt_config(config: Dict[str, str]) -> List[Dict[str, str]]:
    all_configs = []
    if "use_prompt" in config:
@@ -76,7 +104,7 @@ def get_task_name_from_config(task_config: Dict[str, str]) -> str:
        return "{dataset_path}".format(**task_config)


-def include_task_folder(task_dir: str) -> None:
+def include_task_folder(task_dir: str, register_task=True) -> None:
    """
    Calling this function
    """
@@ -87,9 +115,16 @@ def include_task_folder(task_dir: str) -> None:
                    yaml_path = os.path.join(root, f)
                    try:
                        config = utils.load_yaml_config(yaml_path)
-                        all_configs = check_prompt_config(config)
-                        for config in all_configs:
-                            register_configurable_task(config)
+
+                        if register_task:
+                            all_configs = check_prompt_config(config)
+                            for config in all_configs:
+                                register_configurable_task(config)
+                        else:
+                            # If a `task` in config is a list,
+                            # that means it's a benchmark
+                            if type(config["task"]) == list:
+                                register_configurable_group(config)

                    except Exception as error:
                        eval_logger.warning(
@@ -102,6 +137,8 @@ def include_task_folder(task_dir: str) -> None:

 task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
 include_task_folder(task_dir)
+# Register Benchmarks after all tasks have been added
+include_task_folder(task_dir, register_task=False)


 def get_task(task_name, config):

--- a/lm_eval/benchmarks/pythia.yaml
+++ b/lm_eval/benchmarks/pythia.yaml
--- a/lm_eval/benchmarks/t0_eval.yaml
+++ b/lm_eval/benchmarks/t0_eval.yaml
--- a/lm_eval/tasks/squadv2/README.md
+++ b/lm_eval/tasks/squadv2/README.md
+# Task-name
+
+### Paper
+
+Title: `paper title goes here`
+Abstract: `link to paper PDF or arXiv abstract goes here`
+
+`Short description of paper / benchmark goes here:`
+
+Homepage: `homepage to the benchmark's website goes here, if applicable`
+
+
+### Citation
+
+```
+BibTeX-formatted citation goes here
+```
+
+### Subtasks
+
+List or describe tasks defined in this folder, and their names here:
+* `task_name`: `1-sentence description of what this particular task does`
+* `task_name2`: .....
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/squadv2/default.yaml
+++ b/lm_eval/tasks/squadv2/default.yaml
+task: squadv2
+dataset_path: squad_v2
+output_type: greedy_until
+training_split: train
+validation_split: validation
+doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
+doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
+target_delimiter: ""
+should_decontaminate: true
+doc_to_decontamination_query: context
+generation_kwargs:
+  until:
+    - "\n"
+# filter_list:
+#   - name: remove_whitespace
+#     filter:
+#       - function: remove_whitespace
+#       - function: take_first
+metric_list:
+  - metric: !function utils.exact
+    aggregation: mean
+    higher_is_better: true
+  - metric: !function utils.f1
+    aggregation: mean
+    higher_is_better: true
--- a/lm_eval/tasks/squadv2/no_ans.yaml
+++ b/lm_eval/tasks/squadv2/no_ans.yaml
+include: default.yaml
+task: squadv2_noans_loglikelihood
+dataset_path: squad_v2
+output_type: loglikelihood
+training_split: train
+validation_split: validation
+doc_to_target: " unanswerable"
+metric_list:
+  - metric: perplexity
--- a/lm_eval/tasks/squadv2/utils.py
+++ b/lm_eval/tasks/squadv2/utils.py
+import re
+import string
+import collections
+
+
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+
+    def remove_articles(text):
+        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+        return re.sub(regex, " ", text)
+
+    def white_space_fix(text):
+        return " ".join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return "".join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def get_tokens(s):
+    if not s:
+        return []
+    return normalize_answer(s).split()
+
+
+# Exact match (the normalized answer exactly match the gold answer)
+def exact(predictions, references):
+    return int(normalize_answer(references[0]) == normalize_answer(predictions[0]))
+
+
+# The F-score of predicted tokens versus the gold answer
+def f1(predictions, references):
+    gold_toks = get_tokens(references[0])
+    pred_toks = get_tokens(predictions[0])
+    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+    num_same = sum(common.values())
+    if len(gold_toks) == 0 or len(pred_toks) == 0:
+        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+        return int(gold_toks == pred_toks)
+    if num_same == 0:
+        return 0
+    precision = 1.0 * num_same / len(pred_toks)
+    recall = 1.0 * num_same / len(gold_toks)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return f1
--- a/lm_eval/tasks/squadv2/with_noans_prob.yaml
+++ b/lm_eval/tasks/squadv2/with_noans_prob.yaml
+group: squadv2_complete
+task:
+  - squadv2
+  - squadv2_noans_loglikelihood
--- a/main.py
+++ b/main.py
@@ -11,7 +11,6 @@ from lm_eval import evaluator, utils
 from lm_eval.api.registry import ALL_TASKS
 from lm_eval.logger import eval_logger, SPACING
 from lm_eval.tasks import include_task_folder
-from lm_eval.benchmarks import include_benchmarks

 os.environ["TOKENIZERS_PARALLELISM"] = "false"