Unverified Commit be79a281 authored by Lintang Sutawika, committed by GitHub

Merge pull request #777 from EleutherAI/benchmark

[Refactor] Update Benchmark
parents f918c8fd 3de2c0e7
......@@ -236,3 +236,89 @@ Generative tasks:
Tasks using complex filtering:
- GSM8k with CoT (+ with Self-Consistency): (`lm_eval/tasks/gsm8k/gsm8k-cot.yaml` ; `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`)
## Benchmarks
When evaluating a language model, it is not unusual to test across a number of tasks that may not be related to one another, in order to assess a variety of capabilities. To this end, it can be cumbersome to have to list the full set of tasks, or to add a new group name to the YAML of each individual task.
To solve this, we can create a benchmark YAML config. This is a config that contains the names of the tasks that should be included in a particular benchmark. The config consists of two main keys: `group`, which denotes the name of the benchmark, and `task`, which lists the tasks to include. The tasks listed under `task` are task names that have already been registered, and they may also be wildcard patterns such as `hendrycksTest*`, which are matched against all registered task names. A good example is the list of tasks used to evaluate the Pythia suite.
```yaml
group: pythia
task:
- lambada_openai
- wikitext
- piqa
- sciq
- wsc
- winogrande
- arc
- logiqa
- blimp
- hendrycksTest*
```
Alternatively, a benchmark can define its tasks inline, each with its own configuration. Each entry is written the same way a task YAML is usually defined.
```yaml
group: t0_eval
task:
  # Coreference Resolution
  - dataset_path: super_glue
    dataset_name: wsc.fixed
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Coreference Resolution
  - dataset_path: winogrande
    dataset_name: winogrande_xl
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  ...
```
If the benchmark contains several configurations of the same dataset, use the `task` key to differentiate between them. For example, T0-Eval evaluates on three versions of ANLI, but the Hugging Face dataset collects all of them into a single dataset.
```yaml
group: t0_eval
task:
  ...
  - task: anli_r1
    dataset_path: anli
    use_prompt: promptsource:*
    training_split: train_r1
    validation_split: dev_r1
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  - task: anli_r2
    dataset_path: anli
    use_prompt: promptsource:*
    training_split: train_r2
    validation_split: dev_r2
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
```
A benchmark is called the same way as any task: pass its group name to `--tasks`. Benchmark configs can be added under `lm_eval/benchmarks/`.
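The same group name can also be passed through the Python API. The sketch below assumes the harness exposes `lm_eval.evaluator.simple_evaluate` and an `hf` model type; the exact entry point, model-type string, and argument names may differ between versions.

```python
# Minimal sketch (assumed API): evaluate the "pythia" benchmark group exactly
# like a single task, by passing its registered group name in the task list.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",                                      # assumed model-type name
    model_args="pretrained=EleutherAI/pythia-160m",  # any causal LM checkpoint
    tasks=["pythia"],                                # benchmark group defined above
)
print(results["results"])
```

This is equivalent to passing `--tasks pythia` on the command line.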
......@@ -761,7 +761,12 @@ class ConfigurableTask(Task):
            return doc_to_text(doc)
        # Used when applying a Promptsource template
        elif hasattr(doc_to_text, "apply"):
            return doc_to_text.apply(doc)[0]
            applied_prompt = doc_to_text.apply(doc)
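            # Template.apply() yields [input_text, target] when the template fully
            # renders; a shorter result means the target came back empty.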
            if len(applied_prompt) == 2:
                return applied_prompt[0]
            else:
                eval_logger.warning("Applied prompt returns empty string")
                return self._config.fewshot_delimiter
        else:
            print(type(doc_to_text))
            raise TypeError
......@@ -791,7 +796,12 @@ class ConfigurableTask(Task):
            return doc_to_target(doc)
        # Used when applying a Promptsource template
        elif hasattr(doc_to_target, "apply"):
            return doc_to_target.apply(doc)[1]
            applied_prompt = doc_to_target.apply(doc)
            if len(applied_prompt) == 2:
                return applied_prompt[1]
            else:
                eval_logger.warning("Applied prompt returns empty string")
                return self._config.fewshot_delimiter
        else:
            raise TypeError
......
import os
import yaml
from lm_eval import utils
from lm_eval.tasks import register_configurable_task, check_prompt_config
from lm_eval.logger import eval_logger
from lm_eval.api.registry import (
TASK_REGISTRY,
GROUP_REGISTRY,
ALL_TASKS,
)
def include_benchmarks(task_dir):
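    # Walk task_dir for benchmark YAMLs: register any inline task configs they define,
    # then collect the listed task names under the benchmark's group in GROUP_REGISTRY.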
    for root, subdirs, file_list in os.walk(task_dir):
        if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
            for f in file_list:
                if f.endswith(".yaml"):
                    try:
                        benchmark_path = os.path.join(root, f)
                        with open(benchmark_path, "rb") as file:
                            yaml_config = yaml.full_load(file)
                        assert "group" in yaml_config
                        group = yaml_config["group"]
                        all_task_list = yaml_config["task"]
                        config_list = [
                            task for task in all_task_list if type(task) != str
                        ]
                        task_list = [
                            task for task in all_task_list if type(task) == str
                        ]
                        for task_config in config_list:
                            var_configs = check_prompt_config(
                                {
                                    **task_config,
                                    **{"group": group},
                                }
                            )
                            for config in var_configs:
                                register_configurable_task(config)
                        task_names = utils.pattern_match(task_list, ALL_TASKS)
                        for task in task_names:
                            if task in TASK_REGISTRY:
                                if group in GROUP_REGISTRY:
                                    GROUP_REGISTRY[group].append(task)
                                else:
                                    GROUP_REGISTRY[group] = [task]
                                ALL_TASKS.add(group)
                    except Exception as error:
                        eval_logger.warning(
                            "Failed to load benchmark in\n"
                            f"    {benchmark_path}\n"
                            "    Benchmark will not be added to registry\n"
                            f"    Error: {error}"
                        )
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_benchmarks(task_dir)
......@@ -6,7 +6,7 @@ task:
- sciq
- wsc
- winogrande
- arc_*
# - logiqa
# - blimp_*
# - hendrycksTest*
- arc
- logiqa
- blimp
- hendrycksTest*
group: t0_eval
task:
  # Coreference Resolution
  - dataset_path: super_glue
    dataset_name: wsc.fixed
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Coreference Resolution
  - dataset_path: winogrande
    dataset_name: winogrande_xl
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Natural Language Inference
  - dataset_path: super_glue
    dataset_name: cb
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  - dataset_path: super_glue
    dataset_name: rte
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  - task: anli_r1
    dataset_path: anli
    use_prompt: promptsource:*
    training_split: train_r1
    validation_split: dev_r1
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  - task: anli_r2
    dataset_path: anli
    use_prompt: promptsource:*
    training_split: train_r2
    validation_split: dev_r2
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  - task: anli_r3
    dataset_path: anli
    use_prompt: promptsource:*
    training_split: train_r3
    validation_split: dev_r3
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Sentence Completion
  - dataset_path: super_glue
    dataset_name: copa
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Natural Language Inference
  - dataset_path: hellaswag
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Word Sense Disambiguation
  - dataset_path: super_glue
    dataset_name: wic
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
......@@ -11,6 +11,7 @@ import numpy as np
import lm_eval.api
import lm_eval.tasks
import lm_eval.benchmarks
import lm_eval.models
import lm_eval.api.metrics
import lm_eval.api.registry
......
......@@ -44,7 +44,7 @@ def check_prompt_config(config):
        prompt_list = prompts.load_prompt_list(
            use_prompt=config["use_prompt"],
            dataset_name=config["dataset_path"],
            subset_name=config["dataset_name"],
            subset_name=config["dataset_name"] if "dataset_name" in config else None,
        )
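        # NOTE: subset_name falls back to None for configs without a dataset_name (e.g. hellaswag)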
        for idx, prompt_variation in enumerate(prompt_list):
            all_configs.append(
......@@ -54,7 +54,9 @@ def check_prompt_config(config):
                    **{
                        "task": "_".join(
                            [
                                get_task_name_from_config(config),
                                config["task"]
                                if "task" in config
                                else get_task_name_from_config(config),
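                                # prefer the explicit task name when given, so same-dataset
                                # variants (e.g. anli_r1/r2/r3) register under distinct names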
                                prompt_variation,
                            ]
                        )
......@@ -98,58 +100,8 @@ def include_task_folder(task_dir):
)
def include_benchmarks(task_dir, benchmark_dir="benchmarks"):
    for root, subdirs, file_list in os.walk(os.path.join(task_dir, benchmark_dir)):
        if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
            for f in file_list:
                if f.endswith(".yaml"):
                    try:
                        benchmark_path = os.path.join(root, f)
                        with open(benchmark_path, "rb") as file:
                            yaml_config = yaml.full_load(file)
                        assert "group" in yaml_config
                        group = yaml_config["group"]
                        all_task_list = yaml_config["task"]
                        config_list = [
                            task for task in all_task_list if type(task) != str
                        ]
                        task_list = [
                            task for task in all_task_list if type(task) == str
                        ]
                        for task_config in config_list:
                            var_configs = check_prompt_config(
                                {
                                    **task_config,
                                    **{"group": group},
                                }
                            )
                            for config in var_configs:
                                register_configurable_task(config)
                        task_names = utils.pattern_match(task_list, ALL_TASKS)
                        for task in task_names:
                            if task in TASK_REGISTRY:
                                if group in GROUP_REGISTRY:
                                    GROUP_REGISTRY[group].append(task)
                                else:
                                    GROUP_REGISTRY[group] = [task]
                                ALL_TASKS.add(group)
                    except Exception as error:
                        eval_logger.warning(
                            "Failed to load benchmark in\n"
                            f"    {benchmark_path}\n"
                            "    Benchmark will not be added to registry\n"
                            f"    Error: {error}"
                        )
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_task_folder(task_dir)
include_benchmarks(task_dir)
def get_task(task_name, config):
......
group: t0_eval
task:
# # Coreference Resolution
# - dataset_path: super_glue
# dataset_name: wsc.fixed
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# # Coreference Resolution
# - dataset_path: winogrande
# dataset_name: winogrande_xl
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# Natural Language Inference
  - dataset_path: super_glue
    dataset_name: cb
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
# Natural Language Inference
# - dataset_path: super_glue
# dataset_name: rte
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# # Natural Language Inference
# # - dataset_path: anli
# # use_prompt: promptsource:*
# # training_split: train_r1
# # validation_split: dev_r1
# # Sentence Completion
# - dataset_path: super_glue
# dataset_name: copa
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# # Natural Language Inference
# - dataset_path: hellaswag
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
# # Word Sense Disambiguation
# - dataset_path: super_glue
# dataset_name: wic
# use_prompt: promptsource:*
# training_split: train
# validation_split: validation
# metric_list:
# - metric: exact_match
# aggregation: mean
# higher_is_better: true
# ignore_case: true
# ignore_punctuation: true