Commit 17396935 authored by lintangsutawika's avatar lintangsutawika

merged

parents cd8642e7 458342e2
+from functools import partial
 import datasets
@@ -15,9 +17,38 @@ class ContextSampler:
         self.target_delimiter = self.config.target_delimiter
         self.fewshot_delimiter = self.config.fewshot_delimiter

-        self.doc_to_text = self.task.doc_to_text
-        self.doc_to_target = self.task.doc_to_target
-        self.doc_to_choice = self.task.doc_to_choice
+        if (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("doc_to_text", None) is not None
+        ):
+            self.doc_to_text = partial(
+                self.task.doc_to_text,
+                doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
+            )
+        else:
+            self.doc_to_text = self.task.doc_to_text
+
+        if (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("doc_to_target", None) is not None
+        ):
+            self.doc_to_target = partial(
+                self.task.doc_to_target,
+                doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
+            )
+        else:
+            self.doc_to_target = self.task.doc_to_target
+
+        if (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("doc_to_choice", None) is not None
+        ):
+            self.doc_to_choice = partial(
+                self.task.doc_to_choice,
+                doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
+            )
+        else:
+            self.doc_to_choice = self.task.doc_to_choice

         self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
         if fewshot_indices:  # subset few-shot docs from
@@ -52,6 +83,7 @@ class ContextSampler:
                     else self.doc_to_choice(doc)[doc_content]
                 )
                 labeled_examples += self.target_delimiter
-                labeled_examples += (
-                    str(doc_target[0])
-                    if isinstance(doc_target, list)
-                    else doc_target
-                    if self.config.doc_to_choice is None or isinstance(doc_target, str)
-                    else str(self.doc_to_choice(doc)[doc_target])
-                )
-                labeled_examples += self.fewshot_delimiter
+                if doc_target != "":
+                    labeled_examples += (
+                        str(doc_target[0])
+                        if isinstance(doc_target, list)
+                        else doc_target
+                        if self.config.doc_to_choice is None
+                        or isinstance(doc_target, str)
+                        else str(self.doc_to_choice(doc)[doc_target])
+                    )
+                    labeled_examples += self.fewshot_delimiter

         return labeled_examples
......
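The conditional chain above turns a few-shot target into text in one of three ways: the first element of a list of golds, the string itself, or an index into the task's choice list. A minimal sketch of that branch logic (names below are illustrative, not the harness API):

```python
def format_target(doc_target, choices=None):
    # mirrors the ternary above: list -> first gold, str -> as-is,
    # otherwise an integer index into doc_to_choice(doc)
    if isinstance(doc_target, list):
        return str(doc_target[0])
    if choices is None or isinstance(doc_target, str):
        return doc_target
    return str(choices[doc_target])

assert format_target(["yes", "no"]) == "yes"             # multiple golds
assert format_target("Paris") == "Paris"                 # already a string
assert format_target(2, choices=["A", "B", "C"]) == "C"  # choice index
```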
@@ -1171,9 +1171,11 @@ class ConfigurableTask(Task):
         """
         return doc

-    def doc_to_text(self, doc):
+    def doc_to_text(self, doc, doc_to_text=None):
         if self.prompt is not None:
             doc_to_text = self.prompt
+        elif doc_to_text is not None:
+            doc_to_text = doc_to_text
         else:
             doc_to_text = self.config.doc_to_text
@@ -1205,9 +1207,11 @@ class ConfigurableTask(Task):
             print(type(doc_to_text))
             raise TypeError

-    def doc_to_target(self, doc: Mapping) -> Union[int, str, list]:
+    def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
         if self.prompt is not None:
             doc_to_target = self.prompt
+        elif doc_to_target is not None:
+            doc_to_target = doc_to_target
         else:
             doc_to_target = self.config.doc_to_target
@@ -1249,9 +1253,11 @@ class ConfigurableTask(Task):
         else:
             raise TypeError

-    def doc_to_choice(self, doc: Any) -> List[str]:
+    def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
         if self.prompt is not None:
             doc_to_choice = self.prompt
+        elif doc_to_choice is not None:
+            doc_to_choice = doc_to_choice
         elif self.config.doc_to_choice is None:
             eval_logger.error("doc_to_choice was called but not set in config")
         else:
......
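These three methods now resolve their template in the same precedence order: a task-level `prompt` first, then an explicit `doc_to_text`/`doc_to_target`/`doc_to_choice` argument (supplied via the `functools.partial` built in `ContextSampler` above), then the config default. A minimal sketch of that resolution, with a toy class standing in for the real task and config:

```python
from functools import partial

class MiniTask:
    """Toy stand-in for ConfigurableTask; only the precedence logic is real."""

    def __init__(self, prompt=None, config_template="Q: {question}\nA:"):
        self.prompt = prompt
        self.config_template = config_template

    def doc_to_text(self, doc, doc_to_text=None):
        if self.prompt is not None:        # 1. task-level prompt wins
            template = self.prompt
        elif doc_to_text is not None:      # 2. explicit override (few-shot config)
            template = doc_to_text
        else:                              # 3. fall back to the task config
            template = self.config_template
        return template(doc) if callable(template) else template.format(**doc)

task = MiniTask()
fewshot_doc_to_text = lambda doc: f"Q: {doc['question']}\nA: {doc['answer']}"

# What ContextSampler.__init__ builds when fewshot_config provides an override:
render = partial(task.doc_to_text, doc_to_text=fewshot_doc_to_text)
print(render({"question": "2+2?", "answer": "4"}))  # few-shot form, with the answer
print(task.doc_to_text({"question": "2+2?"}))       # normal form, from the config
```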
@@ -607,6 +607,78 @@ def evaluate(
                         _higher_is_better[m] = None
                 higher_is_better[group] = _higher_is_better

+                # collect all metric keys used by a subtask in the group.
+                metric_list = list(
+                    {
+                        key
+                        for task in task_list
+                        for key in results[task].keys()
+                        if "_stderr" not in key and key not in ["alias", "samples"]
+                    }
+                )
+                for metric in metric_list:
+                    stderr = "_stderr,".join(metric.split(","))
+
+                    # gather metrics, sizes, and stderrs from subtasks
+                    metrics = [
+                        results[task][metric]
+                        for task in task_list
+                        if metric in results[task]
+                    ]  # TODO: copy?
+                    stderrs = [
+                        results[task][stderr]
+                        for task in task_list
+                        if stderr in results[task]
+                    ]
+                    sizes = [
+                        results[task]["samples"]
+                        for task in task_list
+                        if metric in results[task]
+                    ]
+
+                    # compute group's pooled metric and stderr
+                    results[group][metric] = lm_eval.api.metrics.aggregate_subtask_metrics(
+                        metrics, sizes
+                    )
+                    # TODO: calculate grouped metric using aggregation fn
+                    if "N/A" in stderrs:
+                        results[group][stderr] = "N/A"
+                    else:
+                        results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(
+                            stderrs, sizes
+                        )
+                        # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
+                        # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
+                        # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
+
+                    results[group]["samples"] = sum(sizes)
+
+        results_agg = defaultdict(dict)
+        groups_agg = defaultdict(dict)
+        all_tasks_list = list(task_hierarchy.keys())
+        while True:
+            add_tasks_list = list(k for k in results_agg.keys())
+            left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
+            if len(left_tasks_list) == 0:
+                break
+
+            _task_hierarchy = {
+                k: v for k, v in task_hierarchy.items() if k in left_tasks_list
+            }
+            _results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)
+
+            results_agg = {**results_agg, **_results_agg}
+            groups_agg = {**groups_agg, **_groups_agg}
+
+        for group_name, task_list in task_hierarchy.items():
+            if task_list:
+                num_fewshot[group_name] = num_fewshot[
+                    task_list[0]
+                ]  # TODO: validate this
+
         results_dict = {
             "results": dict(results_agg.items()),
             **(
......
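The pooling step above collapses subtask scores into one group score. A hedged sketch of the shape of that computation follows; the authoritative formulas are `aggregate_subtask_metrics` and `pooled_sample_stderr` in `lm_eval.api.metrics`, and this toy version simply assumes a size-weighted mean and a classic pooled variance:

```python
import math

def weighted_mean(metrics, sizes):
    # size-weighted mean of subtask scores (the default, weight-by-size case)
    return sum(m * n for m, n in zip(metrics, sizes)) / sum(sizes)

def pooled_stderr(stderrs, sizes):
    # pooled standard error of the mean across disjoint subtask samples;
    # each subtask's sample variance is recovered as s_i^2 = se_i^2 * n_i
    total = sum(sizes)
    pooled_var = sum(
        (n - 1) * (se**2 * n) for se, n in zip(stderrs, sizes)
    ) / (total - len(sizes))
    return math.sqrt(pooled_var / total)

print(weighted_mean([0.80, 0.60], [100, 300]))   # 0.65
print(pooled_stderr([0.04, 0.025], [100, 300]))  # ~0.02
```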
@@ -17,7 +17,7 @@ Homepage (preprocessed): https://huggingface.co/datasets/sjyuxyz/MMLU-Pro-with-s
 ```bibtex
 @misc{wang2024mmlupro,
-      title={MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark}, 
+      title={MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark},
       author={Yubo Wang and Xueguang Ma and Ge Zhang and Yuansheng Ni and Abhranil Chandra and Shiguang Guo and Weiming Ren and Aaran Arulraj and Xuan He and Ziyan Jiang and Tianle Li and Max Ku and Kai Wang and Alex Zhuang and Rongqi Fan and Xiang Yue and Wenhu Chen},
       year={2024},
       eprint={2406.01574},
@@ -31,19 +31,24 @@ Homepage (preprocessed): https://huggingface.co/datasets/sjyuxyz/MMLU-Pro-with-s
 #### Groups

 * `mmlu_pro`: 'All 14 subjects of the mmlu_pro dataset, evaluated following the methodology in mmlu's original implementation'
-* `mmlu_pro_flan_cot_fewshot`: 'mmlu_pro_flan_cot_fewshot includes 5-shot of exemplars for chain-of-thought approach'
-* `mmlu_pro_flan_cot_zeroshot`: 'mmlu_pro_flan_cot_zeroshot evaluates using zero-shot chain-of-thought approach'
-* `mmlu_pro_generative`: 'mmlu_pro_generative solves questions of mmlu_pro using direct (generative) approach'
-* `mmlu_pro_continuation`: 'mmlu_pro_continuation evaluates the ability to continue and complete a given text'

 #### Tasks

 The following tasks evaluate subjects in the mmlu_pro dataset
-- `mmlu_pro_{subject_english}`
-- `mmlu_pro_flan_cot_fewshot_{subject_english}`
-- `mmlu_pro_flan_cot_zeroshot_{subject_english}`
-- `mmlu_pro_generative_{subject_english}`
-- `mmlu_pro_continuation_{subject_english}`
+- `mmlu_pro_biology`
+- `mmlu_pro_business`
+- `mmlu_pro_chemistry`
+- `mmlu_pro_computer_science`
+- `mmlu_pro_economics`
+- `mmlu_pro_engineering`
+- `mmlu_pro_health`
+- `mmlu_pro_history`
+- `mmlu_pro_law`
+- `mmlu_pro_math`
+- `mmlu_pro_other`
+- `mmlu_pro_philosophy`
+- `mmlu_pro_physics`
+- `mmlu_pro_psychology`
### Checklist
......
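For context, a hedged sketch of how the `mmlu_pro` group would be run through the harness's Python entry point (the model choice and arguments below are placeholders, not part of this change):

```python
import lm_eval

# Placeholder model/arguments; any lm-eval-supported model string works here.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["mmlu_pro"],
    num_fewshot=5,
)
print(results["results"]["mmlu_pro"])  # pooled exact_match for the group
```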
dataset_path: TIGER-Lab/MMLU-Pro
test_split: test
fewshot_split: validation
fewshot_config:
  sampler: first_n
  doc_to_text: !function utils.fewshot_to_text
  doc_to_target: ""
output_type: generate_until
doc_to_text: !function utils.doc_to_text
doc_to_target: answer
filter_list:
  - name: "custom-extract"
    filter:
      - function: "regex"
        regex_pattern: 'answer is \(?([ABCDEFGHIJ])\)?'
        # regex_pattern: r".*[aA]nswer:\s*([A-J])",
      - function: "take_first"
generation_kwargs:
  until:
    - "</s>"
    - "Q:"
    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
num_fewshot: 5
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  version: 0.0
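To make the filter concrete, a small illustration of what the `custom-extract` regex pulls out of a chain-of-thought completion (the completion text is invented; the harness's regex filter applies the same pattern for real):

```python
import re

# Same pattern as regex_pattern above; group 1 is the answer letter.
pattern = re.compile(r"answer is \(?([ABCDEFGHIJ])\)?")

completion = "Let's think step by step. Osmosis moves water, so the answer is (C)."
match = pattern.search(completion)
print(match.group(1) if match else "[no match]")  # -> C
```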
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import logging
import os
import yaml
from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
SUBJECTS = {
"business": "other",
"law": "humanities",
"psychology": "social_sciences",
"biology": "stem",
"chemistry": "stem",
"history": "humanities",
"other": "other",
"health": "other",
"economics": "social_sciences",
"math": "stem",
"physics": "stem",
"computer_science": "stem",
"philosophy": "humanities",
"engineering": "stem"
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="mmlu_pro")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
parser.add_argument("--group_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
ALL_CATEGORIES = []
for subject, category in tqdm(SUBJECTS.items()):
if category not in ALL_CATEGORIES:
ALL_CATEGORIES.append(category)
if args.cot_prompt_path is not None:
description = cot_file[subject]
else:
description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = {
"include": base_yaml_name,
"group": f"mmlu_pro_{args.task_prefix}_{category}"
if args.task_prefix != ""
else f"mmlu_pro_{category}",
"group_alias": category.replace("_", " "),
"task": f"mmlu_pro_{args.task_prefix}_{subject}"
if args.task_prefix != ""
else f"mmlu_pro_{subject}",
"task_alias": subject.replace("_", " "),
"dataset_name": subject,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
allow_unicode=True,
default_style='"',
)
if args.task_prefix != "":
mmlu_pro_subcategories = [
f"mmlu_pro_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
]
else:
mmlu_pro_subcategories = [f"mmlu_pro_{category}" for category in ALL_CATEGORIES]
if args.group_prefix != "":
file_save_path = args.group_prefix + ".yaml"
else:
file_save_path = args.save_prefix_path + ".yaml"
eval_logger.info(f"Saving benchmark config to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
{
"group": f"mmlu_pro_{args.task_prefix}"
if args.task_prefix != ""
else "mmlu_pro",
"task": mmlu_pro_subcategories,
},
yaml_file,
indent=4,
default_flow_style=False,
)
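The fully quoted per-subject file that follows is what `yaml.dump(..., default_style='"')` in the script above emits; a tiny reproduction of that behavior with a trimmed-down dict:

```python
import yaml

# default_style='"' forces every scalar (keys and values) into double quotes,
# which is why the generated YAML below looks fully quoted.
print(
    yaml.dump(
        {"include": "_default_template_yaml", "task": "mmlu_pro_health"},
        allow_unicode=True,
        default_style='"',
    )
)
# "include": "_default_template_yaml"
# "task": "mmlu_pro_health"
```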
"dataset_name": "health"
"description": "The following are multiple choice questions (with answers) about health.\n\
\n"
"group": "mmlu_pro_other"
"group_alias": "other"
"include": "_default_template_yaml"
"task": "mmlu_pro_health"
"task_alias": "health"
description: "The following are multiple choice questions (with answers) about biology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_biology"
task_alias: "biology"
process_docs: !function utils.process_biology
description: "The following are multiple choice questions (with answers) about business. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_business"
task_alias: "business"
process_docs: !function utils.process_business
description: "The following are multiple choice questions (with answers) about chemistry. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_chemistry"
task_alias: "chemistry"
process_docs: !function utils.process_chemistry
description: "The following are multiple choice questions (with answers) about computer science. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_computer_science"
task_alias: "computer_science"
process_docs: !function utils.process_computer_science
description: "The following are multiple choice questions (with answers) about economics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_economics"
task_alias: "economics"
process_docs: !function utils.process_economics
description: "The following are multiple choice questions (with answers) about engineering. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_engineering"
task_alias: "engineering"
process_docs: !function utils.process_engineering
description: "The following are multiple choice questions (with answers) about health. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_health"
task_alias: "health"
process_docs: !function utils.process_health
description: "The following are multiple choice questions (with answers) about history. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_history"
task_alias: "history"
process_docs: !function utils.process_history
description: "The following are multiple choice questions (with answers) about law. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_law"
task_alias: "law"
process_docs: !function utils.process_law
description: "The following are multiple choice questions (with answers) about math. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_math"
task_alias: "math"
process_docs: !function utils.process_math
description: "The following are multiple choice questions (with answers) about other. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_other"
task_alias: "other"
process_docs: !function utils.process_other
description: "The following are multiple choice questions (with answers) about philosophy. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_philosophy"
task_alias: "philosophy"
process_docs: !function utils.process_philosophy