Commit 17396935 authored by lintangsutawika

merged

parents cd8642e7 458342e2
from functools import partial
import datasets
...@@ -15,8 +17,37 @@ class ContextSampler:
self.target_delimiter = self.config.target_delimiter
self.fewshot_delimiter = self.config.fewshot_delimiter
if (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("doc_to_text", None) is not None
):
self.doc_to_text = partial(
self.task.doc_to_text,
doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
)
else:
self.doc_to_text = self.task.doc_to_text
if (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("doc_to_target", None) is not None
):
self.doc_to_target = partial(
self.task.doc_to_target,
doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
)
else:
self.doc_to_target = self.task.doc_to_target
if (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("doc_to_choice", None) is not None
):
self.doc_to_choice = partial(
self.task.doc_to_choice,
doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
)
else:
self.doc_to_choice = self.task.doc_to_choice
self.docs = docs # HF dataset split, provided by task._fewshot_docs()
...@@ -52,6 +83,7 @@ class ContextSampler:
else self.doc_to_choice(doc)[doc_content]
)
labeled_examples += self.target_delimiter
<<<<<<< HEAD
labeled_examples += (
str(doc_target[0])
if isinstance(doc_target, list)
...@@ -60,6 +92,17 @@ class ContextSampler:
else str(self.doc_to_choice(doc)[doc_target])
)
labeled_examples += self.fewshot_delimiter
=======
if doc_target != "":
labeled_examples += (
str(doc_target[0])
if isinstance(doc_target, list)
else doc_target
if self.config.doc_to_choice is None or isinstance(doc_target, str)
else str(self.doc_to_choice(doc)[doc_target])
)
labeled_examples += self.fewshot_delimiter
>>>>>>> mmlu-pro-changes
return labeled_examples
...
...@@ -1171,9 +1171,11 @@ class ConfigurableTask(Task):
"""
return doc
def doc_to_text(self, doc, doc_to_text=None):
if self.prompt is not None:
doc_to_text = self.prompt
elif doc_to_text is not None:
doc_to_text = doc_to_text
else:
doc_to_text = self.config.doc_to_text
...@@ -1205,9 +1207,11 @@ class ConfigurableTask(Task):
print(type(doc_to_text))
raise TypeError
def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
if self.prompt is not None:
doc_to_target = self.prompt
elif doc_to_target is not None:
doc_to_target = doc_to_target
else:
doc_to_target = self.config.doc_to_target
...@@ -1249,9 +1253,11 @@ class ConfigurableTask(Task):
else:
raise TypeError
def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
if self.prompt is not None:
doc_to_choice = self.prompt
elif doc_to_choice is not None:
doc_to_choice = doc_to_choice
elif self.config.doc_to_choice is None:
eval_logger.error("doc_to_choice was called but not set in config")
else:
...
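Read together with the ContextSampler diff above, these new optional parameters let the few-shot sampler bind a per-fewshot override through functools.partial, so that calling the bound function with only the document routes through the override instead of the task-level template. A minimal, self-contained sketch of that call pattern (the Task class and template strings below are illustrative placeholders, not the harness classes):

from functools import partial

class Task:
    # Illustrative stand-in for ConfigurableTask; only the override logic is mirrored.
    def doc_to_text(self, doc, doc_to_text=None):
        # An explicit doc_to_text argument wins over the task-level template,
        # matching the new `elif doc_to_text is not None` branch in the diff.
        template = doc_to_text if doc_to_text is not None else "Q: {question}\nA:"
        return template.format(**doc)

task = Task()
# What ContextSampler now does when fewshot_config supplies a doc_to_text override:
fewshot_doc_to_text = partial(task.doc_to_text, doc_to_text="{question}\nAnswer:")

doc = {"question": "What is 2 + 2?"}
print(task.doc_to_text(doc))      # task-level template
print(fewshot_doc_to_text(doc))   # few-shot override, same one-argument call as before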
...@@ -607,6 +607,78 @@ def evaluate(
_higher_is_better[m] = None
higher_is_better[group] = _higher_is_better
<<<<<<< HEAD
=======
# collect all metric keys used by a subtask in the group.
metric_list = list(
{
key
for task in task_list
for key in results[task].keys()
if "_stderr" not in key and key not in ["alias", "samples"]
}
)
for metric in metric_list:
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric]
for task in task_list
if metric in results[task]
] # TODO: copy?
stderrs = [
results[task][stderr]
for task in task_list
if stderr in results[task]
]
sizes = [
results[task]["samples"]
for task in task_list
if metric in results[task]
]
# compute group's pooled metric and stderr
results[group][metric] = (
lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][stderr] = (
lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
results[group]["samples"] = sum(sizes)
results_agg = defaultdict(dict)
groups_agg = defaultdict(dict)
all_tasks_list = list(task_hierarchy.keys())
while True:
add_tasks_list = list(k for k in results_agg.keys())
left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
if len(left_tasks_list) == 0:
break
_task_hierarchy = {
k: v for k, v in task_hierarchy.items() if k in left_tasks_list
}
_results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)
results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}
for group_name, task_list in task_hierarchy.items():
if task_list:
num_fewshot[group_name] = num_fewshot[
task_list[0]
] # TODO: validate this
>>>>>>> mmlu-pro-changes
results_dict = {
"results": dict(results_agg.items()),
**(
...
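For context, the mmlu-pro-changes side of this hunk pools each group metric as a sample-size-weighted combination of its subtasks and pools the stderrs to match. The exact formulas live in lm_eval.api.metrics.aggregate_subtask_metrics and pooled_sample_stderr and may differ in detail; as a rough sketch of the intent only, a size-weighted mean and the standard error of such a weighted mean look like this:

import math

def weighted_mean(metrics, sizes):
    # Size-weighted average of per-subtask scores (illustration, not the harness code).
    return sum(m * n for m, n in zip(metrics, sizes)) / sum(sizes)

def weighted_mean_stderr(stderrs, sizes):
    # Standard error of the size-weighted mean, treating subtask means as independent:
    # Var(sum_i (n_i / N) * x_i) = sum_i (n_i / N)^2 * se_i^2
    total = sum(sizes)
    var = sum((n / total) ** 2 * se ** 2 for se, n in zip(stderrs, sizes))
    return math.sqrt(var)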
...@@ -31,19 +31,24 @@ Homepage (preprocessed): https://huggingface.co/datasets/sjyuxyz/MMLU-Pro-with-s
#### Groups
* `mmlu_pro`: 'All 14 subjects of the mmlu_pro dataset, evaluated following the methodology in mmlu's original implementation'
* `mmlu_pro_flan_cot_fewshot`: 'mmlu_pro_flan_cot_fewshot includes 5-shot of exemplars for chain-of-thought approach'
* `mmlu_pro_flan_cot_zeroshot`: 'mmlu_pro_flan_cot_zeroshot evaluates using zero-shot chain-of-thought approach'
* `mmlu_pro_generative`: 'mmlu_pro_generative solves questions of mmlu_pro using direct (generative) approach'
* `mmlu_pro_continuation`: 'mmlu_pro_continuation evaluates the ability to continue and complete a given text'
#### Tasks
The following tasks evaluate subjects in the mmlu_pro dataset (a sample invocation is sketched after the list):
- `mmlu_pro_biology`
- `mmlu_pro_business`
- `mmlu_pro_chemistry`
- `mmlu_pro_computer_science`
- `mmlu_pro_economics`
- `mmlu_pro_engineering`
- `mmlu_pro_health`
- `mmlu_pro_history`
- `mmlu_pro_law`
- `mmlu_pro_math`
- `mmlu_pro_other`
- `mmlu_pro_philosophy`
- `mmlu_pro_physics`
- `mmlu_pro_psychology`
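Once the configs in this commit are installed, the group or any single subject task can be requested by the names above. A hypothetical invocation through the Python API (the pretrained model string is a placeholder, and the exact keys of the returned results dict may vary by harness version):

import lm_eval

# Evaluate the whole group; pass e.g. ["mmlu_pro_biology"] to run a single subject.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder checkpoint
    tasks=["mmlu_pro"],
)
print(results["results"])  # per-task and per-group scores, keyed by task/group name

The CLI equivalent would be along the lines of `lm_eval --model hf --model_args pretrained=... --tasks mmlu_pro`.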
### Checklist
...
dataset_path: TIGER-Lab/MMLU-Pro
test_split: test
fewshot_split: validation
fewshot_config:
sampler: first_n
doc_to_text: !function utils.fewshot_to_text
doc_to_target: ""
output_type: generate_until
doc_to_text: !function utils.doc_to_text
doc_to_target: answer
filter_list:
- name: "custom-extract"
filter:
- function: "regex"
regex_pattern: r"answer is \(?([ABCDEFGHIJ])\)?"
# regex_pattern: r".*[aA]nswer:\s*([A-J])",
- function: "take_first"
generation_kwargs:
until:
- "</s>"
- "Q:"
- "<|im_end|>"
do_sample: false
temperature: 0.0
num_fewshot: 5
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
metadata:
version: 0.0
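For what it's worth, the custom-extract filter above runs the regex over each generated chain of thought to pull out the predicted letter, and take_first then keeps a single extraction per document. A quick standalone check of that pattern (the sample completion below is made up):

import re

# Same answer-extraction pattern as regex_pattern in the filter_list above.
ANSWER_RE = re.compile(r"answer is \(?([ABCDEFGHIJ])\)?")

completion = "Let's think step by step. Only option C is consistent, so the answer is (C)."
match = ANSWER_RE.search(completion)
print(match.group(1) if match else "[no match]")  # prints: C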
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import logging
import os
import yaml
from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
SUBJECTS = {
"business": "other",
"law": "humanities",
"psychology": "social_sciences",
"biology": "stem",
"chemistry": "stem",
"history": "humanities",
"other": "other",
"health": "other",
"economics": "social_sciences",
"math": "stem",
"physics": "stem",
"computer_science": "stem",
"philosophy": "humanities",
"engineering": "stem"
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="mmlu_pro")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
parser.add_argument("--group_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
ALL_CATEGORIES = []
for subject, category in tqdm(SUBJECTS.items()):
if category not in ALL_CATEGORIES:
ALL_CATEGORIES.append(category)
if args.cot_prompt_path is not None:
description = cot_file[subject]
else:
description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = {
"include": base_yaml_name,
"group": f"mmlu_pro_{args.task_prefix}_{category}"
if args.task_prefix != ""
else f"mmlu_pro_{category}",
"group_alias": category.replace("_", " "),
"task": f"mmlu_pro_{args.task_prefix}_{subject}"
if args.task_prefix != ""
else f"mmlu_pro_{subject}",
"task_alias": subject.replace("_", " "),
"dataset_name": subject,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
allow_unicode=True,
default_style='"',
)
if args.task_prefix != "":
mmlu_pro_subcategories = [
f"mmlu_pro_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
]
else:
mmlu_pro_subcategories = [f"mmlu_pro_{category}" for category in ALL_CATEGORIES]
if args.group_prefix != "":
file_save_path = args.group_prefix + ".yaml"
else:
file_save_path = args.save_prefix_path + ".yaml"
eval_logger.info(f"Saving benchmark config to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
{
"group": f"mmlu_pro_{args.task_prefix}"
if args.task_prefix != ""
else "mmlu_pro",
"task": mmlu_pro_subcategories,
},
yaml_file,
indent=4,
default_flow_style=False,
)
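For reference, the per-subject YAML files that follow look like the output of this script, invoked with something along the lines of `python _generate_configs.py --base_yaml_path _default_template_yaml --save_prefix_path mmlu_pro` (optionally with `--cot_prompt_path` pointing at a JSON file mapping each subject to a chain-of-thought description); each run writes one `mmlu_pro_<subject>.yaml` per entry in SUBJECTS plus a top-level group YAML.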
"dataset_name": "health"
"description": "The following are multiple choice questions (with answers) about health.\n\
\n"
"group": "mmlu_pro_other"
"group_alias": "other"
"include": "_default_template_yaml"
"task": "mmlu_pro_health"
"task_alias": "health"
description: "The following are multiple choice questions (with answers) about biology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_biology"
task_alias: "biology"
process_docs: !function utils.process_biology
description: "The following are multiple choice questions (with answers) about business. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_business"
task_alias: "business"
process_docs: !function utils.process_business
description: "The following are multiple choice questions (with answers) about chemistry. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_chemistry"
task_alias: "chemistry"
process_docs: !function utils.process_chemistry
description: "The following are multiple choice questions (with answers) about computer science. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_computer_science"
task_alias: "computer_science"
process_docs: !function utils.process_computer_science
description: "The following are multiple choice questions (with answers) about economics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_economics"
task_alias: "economics"
process_docs: !function utils.process_economics
description: "The following are multiple choice questions (with answers) about engineering. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_engineering"
task_alias: "engineering"
process_docs: !function utils.process_engineering
description: "The following are multiple choice questions (with answers) about health. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_health"
task_alias: "health"
process_docs: !function utils.process_health
description: "The following are multiple choice questions (with answers) about history. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_history"
task_alias: "history"
process_docs: !function utils.process_history
description: "The following are multiple choice questions (with answers) about law. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_law"
task_alias: "law"
process_docs: !function utils.process_law
description: "The following are multiple choice questions (with answers) about math. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_math"
task_alias: "math"
process_docs: !function utils.process_math
description: "The following are multiple choice questions (with answers) about other. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_other"
task_alias: "other"
process_docs: !function utils.process_other
description: "The following are multiple choice questions (with answers) about philosophy. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_philosophy"
task_alias: "philosophy"
process_docs: !function utils.process_philosophy