Merge pull request #864 from EleutherAI/add-fewshot-config

[Refactor] CMMLU, C-Eval port ; Add fewshot config

Merge pull request #864 from EleutherAI/add-fewshot-config
[Refactor] CMMLU, C-Eval port ; Add fewshot config
bdddfec2 · Hailey Schoelkopf · GitHub · 0f6cd358 · f88ffeee · bdddfec2
Unverified Commit bdddfec2 authored Sep 21, 2023 by Hailey Schoelkopf Committed by GitHub Sep 21, 2023
20 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -40,9 +40,7 @@ repos:
      - id: codespell
        exclude: >
          (?x)^(
-          .*\.json
-          |ignore.txt
-          |lm_eval/tasks/.*
+              .*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml
          )$
        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
  - repo: https://github.com/pre-commit/mirrors-mypy

--- a/lm_eval/api/samplers.py
+++ b/lm_eval/api/samplers.py
-class Sampler:
+class ContextSampler:
    def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
        self.rnd = rnd
        assert self.rnd, "must pass rnd to FewShotSampler!"
@@ -71,7 +71,19 @@ class Sampler:
        return self.rnd.sample(self.docs, n)


-class BalancedSampler(Sampler):
+class FirstNSampler(ContextSampler):
+    def sample(self, n) -> list:
+        """
+        Draw the first `n` samples in order from the specified split.
+        Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
+
+        :param n: number of fewshot examples to return.
+        :return: `self.docs[:n]`, i.e. the leading `n` docs in their stored order
+            (a list when the sampler was built from a list of docs).
+        :raises AssertionError: if `n` is larger than `len(self.docs)`.
+        """
+        # No randomness here: unlike the base sampler, `self.rnd` is not consulted.
+        assert n <= len(
+            self.docs
+        ), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
+        return self.docs[:n]
+
+
+class BalancedSampler(ContextSampler):
    def sample(self, n) -> None:
        """
        TODO: this should return approximately class-balanced samples from our fewshot examples.
@@ -81,12 +93,27 @@ class BalancedSampler(Sampler):
        pass


-class ManualSampler(Sampler):
+class ManualSampler(ContextSampler):
    def sample(self, n) -> None:
        """ """
        pass


+SAMPLER_REGISTRY = {
+    "default": ContextSampler,
+    "first_n": FirstNSampler,
+}
+
+
+def get_sampler(name):
+    """
+    Look up a fewshot sampler class by its registered name.
+
+    :param name: key into `SAMPLER_REGISTRY` (e.g. "default" or "first_n").
+    :return: the sampler class registered under `name` (the class itself, not an instance).
+    :raises ValueError: if no sampler is registered under `name`.
+    """
+    try:
+        return SAMPLER_REGISTRY[name]
+    except KeyError:
+        # `from None` drops the internal KeyError so only the actionable message is shown.
+        raise ValueError(
+            f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported sampler names: {', '.join(SAMPLER_REGISTRY.keys())}"
+        ) from None
+
+
 # TODO: how should we do design here? might be better to have a single sampler and pass more kwargs at init.
 # Depends what's easier for new user to add own functionality on top of


--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -75,6 +75,7 @@ class TaskConfig(dict):
    description: str = ""
    target_delimiter: str = " "
    fewshot_delimiter: str = "\n\n"
+    fewshot_config: dict = None
    # runtime configuration options
    num_fewshot: int = 0
    # scoring options
@@ -629,9 +630,9 @@ class ConfigurableTask(Task):
            self.prompt = None

        if self.fewshot_docs() is not None:
-            self.sampler = samplers.Sampler(
-                list(self.fewshot_docs()), self, rnd=random.Random(1234)
-            )
+            # `fewshot_config` defaults to None in TaskConfig, so guard before calling `.get`;
+            # tasks without a fewshot_config (or without a "sampler" key) use the "default" sampler.
+            fewshot_config = self.config.fewshot_config or {}
+            self.sampler = samplers.get_sampler(
+                fewshot_config.get("sampler", "default")
+            )(list(self.fewshot_docs()), self, rnd=random.Random(1234))

        if self.has_test_docs():
            self.task_docs = self.test_docs()

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -469,10 +469,10 @@ def evaluate(

            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
            # so we run them less iterations. still looking for a cleaner way to do this
-            if False:  # bootstrap_iters > 0:
+            if bootstrap_iters > 0:
                stderr = lm_eval.api.metrics.stderr_for_metric(
                    metric=task.aggregation()[metric],
-                    bootstrap_iters=min(bootstrap_iters, 1000)
+                    bootstrap_iters=min(bootstrap_iters, 100)
                    if metric in ["bleu", "chrf", "ter"]
                    else bootstrap_iters,
                )

--- a/lm_eval/tasks/ceval/README.md
+++ b/lm_eval/tasks/ceval/README.md
+# C-Eval (Validation)
+
+### Paper
+C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models
+https://arxiv.org/pdf/2305.08322.pdf
+
+C-Eval is a comprehensive Chinese evaluation suite for foundation models.
+It consists of 13948 multi-choice questions spanning 52 diverse disciplines
+and four difficulty levels.
+
+Homepage: https://cevalbenchmark.com/
+
+### Citation
+
+```bibtex
+@article{huang2023ceval,
+    title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
+    author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
+    journal={arXiv preprint arXiv:2305.08322},
+    year={2023}
+}
+```
+
+
+SUBJECTS = {
+    "computer_network":"计算机网络",
+    "operating_system":"操作系统",
+    "computer_architecture":"计算机组成",
+    "college_programming":"大学编程",
+    "college_physics":"大学物理",
+    "college_chemistry":"大学化学",
+    "advanced_mathematics":"高等数学",
+    "probability_and_statistics":"概率统计",
+    "discrete_mathematics":"离散数学",
+    "electrical_engineer":"注册电气工程师",
+    "metrology_engineer":"注册计量师",
+    "high_school_mathematics":"高中数学",
+    "high_school_physics":"高中物理",
+    "high_school_chemistry":"高中化学",
+    "high_school_biology":"高中生物",
+    "middle_school_mathematics":"初中数学",
+    "middle_school_biology":"初中生物",
+    "middle_school_physics":"初中物理",
+    "middle_school_chemistry":"初中化学",
+    "veterinary_medicine":"兽医学",
+    "college_economics":"大学经济学",
+    "business_administration":"工商管理",
+    "marxism":"马克思主义基本原理",
+    "mao_zedong_thought":"毛泽东思想和中国特色社会主义理论体系概论",
+    "education_science":"教育学",
+    "teacher_qualification":"教师资格",
+    "high_school_politics":"高中政治",
+    "high_school_geography":"高中地理",
+    "middle_school_politics":"初中政治",
+    "middle_school_geography":"初中地理",
+    "modern_chinese_history":"近代史纲要",
+    "ideological_and_moral_cultivation":"思想道德修养与法律基础",
+    "logic":"逻辑学",
+    "law":"法学",
+    "chinese_language_and_literature":"中国语言文学",
+    "art_studies":"艺术学",
+    "professional_tour_guide":"导游资格",
+    "legal_professional":"法律职业资格",
+    "high_school_chinese":"高中语文",
+    "high_school_history":"高中历史",
+    "middle_school_history":"初中历史",
+    "civil_servant":"公务员",
+    "sports_science":"体育学",
+    "plant_protection":"植物保护",
+    "basic_medicine":"基础医学",
+    "clinical_medicine":"临床医学",
+    "urban_and_rural_planner":"注册城乡规划师",
+    "accountant":"注册会计师",
+    "fire_engineer":"注册消防工程师",
+    "environmental_impact_assessment_engineer":"环境影响评价工程师",
+    "tax_accountant":"税务师",
+    "physician":"医师资格"
+}
+
+
+# CMMLU
+
+### Paper
+
+CMMLU: Measuring massive multitask language understanding in Chinese
+https://arxiv.org/abs/2306.09212
+
+CMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of Chinese language and culture.
+CMMLU covers a wide range of subjects, comprising 67 topics that span from elementary to advanced professional levels.
+
+Homepage: https://github.com/haonan-li/CMMLU
+
+### Citation
+
+```bibtex
+@misc{li2023cmmlu,
+      title={CMMLU: Measuring massive multitask language understanding in Chinese},
+      author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin},
+      year={2023},
+      eprint={2306.09212},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
+
+### Groups and Tasks
+
+#### Groups
+
+- `ceval-valid`: All 52 subjects of the C-Eval dataset, evaluated following the methodology in MMLU's original implementation. This implementation consists solely of the validation set of C-Eval, as the test set requires submission of model predictions to an external site.
+
+#### Tasks
+
+
+The following tasks evaluate subjects in the C-Eval dataset using loglikelihood-based multiple-choice scoring:
+- `ceval-valid_{subject_english}`
+
+### Checklist
+
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation?
+
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/ceval/_default_ceval_yaml
+++ b/lm_eval/tasks/ceval/_default_ceval_yaml
+group: ceval-valid
+dataset_path: ceval/ceval-exam
+validation_split: val
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+output_type: multiple_choice
+doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案："
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: "1.0"
--- a/lm_eval/tasks/ceval/_generate_configs.py
+++ b/lm_eval/tasks/ceval/_generate_configs.py
+"""
+Take a base YAML config and write one task YAML per C-Eval subject, each of which includes the base YAML.
+"""
+import os
+import yaml
+import argparse
+
+from tqdm import tqdm
+
+from lm_eval.logger import eval_logger
+
+SUBJECTS = {
+    "computer_network": "计算机网络",
+    "operating_system": "操作系统",
+    "computer_architecture": "计算机组成",
+    "college_programming": "大学编程",
+    "college_physics": "大学物理",
+    "college_chemistry": "大学化学",
+    "advanced_mathematics": "高等数学",
+    "probability_and_statistics": "概率统计",
+    "discrete_mathematics": "离散数学",
+    "electrical_engineer": "注册电气工程师",
+    "metrology_engineer": "注册计量师",
+    "high_school_mathematics": "高中数学",
+    "high_school_physics": "高中物理",
+    "high_school_chemistry": "高中化学",
+    "high_school_biology": "高中生物",
+    "middle_school_mathematics": "初中数学",
+    "middle_school_biology": "初中生物",
+    "middle_school_physics": "初中物理",
+    "middle_school_chemistry": "初中化学",
+    "veterinary_medicine": "兽医学",
+    "college_economics": "大学经济学",
+    "business_administration": "工商管理",
+    "marxism": "马克思主义基本原理",
+    "mao_zedong_thought": "毛泽东思想和中国特色社会主义理论体系概论",
+    "education_science": "教育学",
+    "teacher_qualification": "教师资格",
+    "high_school_politics": "高中政治",
+    "high_school_geography": "高中地理",
+    "middle_school_politics": "初中政治",
+    "middle_school_geography": "初中地理",
+    "modern_chinese_history": "近代史纲要",
+    "ideological_and_moral_cultivation": "思想道德修养与法律基础",
+    "logic": "逻辑学",
+    "law": "法学",
+    "chinese_language_and_literature": "中国语言文学",
+    "art_studies": "艺术学",
+    "professional_tour_guide": "导游资格",
+    "legal_professional": "法律职业资格",
+    "high_school_chinese": "高中语文",
+    "high_school_history": "高中历史",
+    "middle_school_history": "初中历史",
+    "civil_servant": "公务员",
+    "sports_science": "体育学",
+    "plant_protection": "植物保护",
+    "basic_medicine": "基础医学",
+    "clinical_medicine": "临床医学",
+    "urban_and_rural_planner": "注册城乡规划师",
+    "accountant": "注册会计师",
+    "fire_engineer": "注册消防工程师",
+    "environmental_impact_assessment_engineer": "环境影响评价工程师",
+    "tax_accountant": "税务师",
+    "physician": "医师资格",
+}
+
+
+def parse_args():
+    """
+    Build the command-line parser for this script and parse `sys.argv`.
+
+    Options: `--base_yaml_path` (required), `--save_prefix_path` (default "ceval-valid"),
+    `--cot_prompt_path` (default None) and `--task_prefix` (default "").
+    """
+    cli = argparse.ArgumentParser()
+    # Only the base YAML path is mandatory; every other option carries a default.
+    cli.add_argument("--base_yaml_path", required=True)
+    optional_defaults = {
+        "--save_prefix_path": "ceval-valid",
+        "--cot_prompt_path": None,
+        "--task_prefix": "",
+    }
+    for flag, fallback in optional_defaults.items():
+        cli.add_argument(flag, default=fallback)
+    return cli.parse_args()
+
+
+if __name__ == "__main__":
+
+    args = parse_args()
+
+    # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
+    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
+    # The parsed content is not referenced again below; loading it still fails fast
+    # when the path is wrong or the file is not valid YAML.
+    # Explicit UTF-8 so reading does not depend on the platform's locale encoding.
+    with open(args.base_yaml_path, encoding="utf-8") as f:
+        base_yaml = yaml.full_load(f)
+
+    # Optional JSON file mapping each English subject name to its task description.
+    if args.cot_prompt_path is not None:
+        import json
+
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
+            cot_file = json.load(f)
+
+    # Emit one small YAML per subject; each only overrides the per-subject fields
+    # and pulls everything else from the base YAML via "include".
+    for subject_eng, subject_zh in tqdm(SUBJECTS.items()):
+        if args.cot_prompt_path is not None:
+            description = cot_file[subject_eng]
+        else:
+            description = f"以下是中国关于{subject_zh}的单项选择题，请选出其中的正确答案。\n\n"
+
+        yaml_dict = {
+            "include": base_yaml_name,
+            "task": f"ceval-valid_{args.task_prefix}_{subject_eng}"
+            if args.task_prefix != ""
+            else f"ceval-valid_{subject_eng}",
+            "dataset_name": subject_eng,
+            "description": description,
+        }
+
+        file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
+        eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
+        # `allow_unicode=True` writes the Chinese description verbatim, so the file
+        # must be opened as UTF-8 rather than with the locale default encoding.
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
+            yaml.dump(
+                yaml_dict,
+                yaml_file,
+                width=float("inf"),
+                allow_unicode=True,
+                default_style='"',
+            )
--- a/lm_eval/tasks/ceval/ceval-valid_accountant.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_accountant.yaml
+"dataset_name": "accountant"
+"description": "以下是中国关于注册会计师的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_accountant"
--- a/lm_eval/tasks/ceval/ceval-valid_advanced_mathematics.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_advanced_mathematics.yaml
+"dataset_name": "advanced_mathematics"
+"description": "以下是中国关于高等数学的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_advanced_mathematics"
--- a/lm_eval/tasks/ceval/ceval-valid_art_studies.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_art_studies.yaml
+"dataset_name": "art_studies"
+"description": "以下是中国关于艺术学的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_art_studies"
--- a/lm_eval/tasks/ceval/ceval-valid_basic_medicine.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_basic_medicine.yaml
+"dataset_name": "basic_medicine"
+"description": "以下是中国关于基础医学的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_basic_medicine"
--- a/lm_eval/tasks/ceval/ceval-valid_business_administration.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_business_administration.yaml
+"dataset_name": "business_administration"
+"description": "以下是中国关于工商管理的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_business_administration"
--- a/lm_eval/tasks/ceval/ceval-valid_chinese_language_and_literature.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_chinese_language_and_literature.yaml
+"dataset_name": "chinese_language_and_literature"
+"description": "以下是中国关于中国语言文学的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_chinese_language_and_literature"
--- a/lm_eval/tasks/ceval/ceval-valid_civil_servant.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_civil_servant.yaml
+"dataset_name": "civil_servant"
+"description": "以下是中国关于公务员的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_civil_servant"
--- a/lm_eval/tasks/ceval/ceval-valid_clinical_medicine.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_clinical_medicine.yaml
+"dataset_name": "clinical_medicine"
+"description": "以下是中国关于临床医学的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_clinical_medicine"
--- a/lm_eval/tasks/ceval/ceval-valid_college_chemistry.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_college_chemistry.yaml
+"dataset_name": "college_chemistry"
+"description": "以下是中国关于大学化学的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_college_chemistry"
--- a/lm_eval/tasks/ceval/ceval-valid_college_economics.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_college_economics.yaml
+"dataset_name": "college_economics"
+"description": "以下是中国关于大学经济学的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_college_economics"
--- a/lm_eval/tasks/ceval/ceval-valid_college_physics.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_college_physics.yaml
+"dataset_name": "college_physics"
+"description": "以下是中国关于大学物理的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_college_physics"
--- a/lm_eval/tasks/ceval/ceval-valid_college_programming.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_college_programming.yaml
+"dataset_name": "college_programming"
+"description": "以下是中国关于大学编程的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_college_programming"
--- a/lm_eval/tasks/ceval/ceval-valid_computer_architecture.yaml
+++ b/lm_eval/tasks/ceval/ceval-valid_computer_architecture.yaml
+"dataset_name": "computer_architecture"
+"description": "以下是中国关于计算机组成的单项选择题，请选出其中的正确答案。\n\n"
+"include": "_default_ceval_yaml"
+"task": "ceval-valid_computer_architecture"