"vscode:/vscode.git/clone" did not exist on "ef98803d63a4e4c56853688343f011256ced130d"
Commit 1ce8c97a authored by Yen-Ting Lin's avatar Yen-Ting Lin
Browse files

add taiwan truthful qa

parent 7fe2b93c
# TMMLU+
### Paper
Title: `An Improved Traditional Chinese Evaluation Suite for Foundation Model`
Abstract: `We present TMMLU+, a comprehensive dataset designed for Traditional Chinese massive multitask language understanding. TMMLU+ is a multiple-choice question-answering dataset with 66 subjects from elementary to professional level. Compared to its predecessor, TMMLU, TMMLU+ is six times larger and has a more balanced subject distribution. We include benchmark results on TMMLU+ from closed-source models and 24 open-weight Chinese large language models with parameters ranging from 1.8B to 72B. Our findings reveal that Traditional Chinese models still trail behind their Simplified Chinese counterparts. Additionally, current large language models have yet to outperform human performance in average scores. We publicly release our dataset and the corresponding benchmark source code.`
Homepage: [https://huggingface.co/datasets/ikala/tmmluplus](https://huggingface.co/datasets/ikala/tmmluplus)
### Citation
```
@article{ikala2024improved,
  title={An Improved Traditional Chinese Evaluation Suite for Foundation Model},
  author={Tam, Zhi-Rui and Pai, Ya-Ting and Lee, Yen-Wei and Cheng, Sega and Shuai, Hong-Han},
  journal={arXiv preprint arXiv:2403.01858},
  year={2024}
}
```
### Groups and Tasks
#### Groups
* `tmmluplus`: `The dataset comprises 22,690 multiple-choice questions from 66 subjects ranging from primary to professional level.`
#### Tasks
The following tasks evaluate subjects in the TMMLU+ dataset using loglikelihood-based multiple-choice scoring:
* `tmmluplus_{subject_english}`
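For example, the whole group can be scored with the harness CLI via `lm_eval --model hf --model_args pretrained=<model> --tasks tmmluplus`, or a single subject via its `tmmluplus_{subject_english}` task name (a sketch; the model path is a placeholder and exact flags depend on the installed harness version).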
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: yentinglin/pegatron_benchmark_multiple_choice
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.1
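For reference, `doc_to_text` above flattens each processed document into a plain prompt, and the harness applies loglikelihood-based multiple-choice scoring over the letters in `doc_to_choice`. A minimal sketch of the rendered prompt, using a made-up document:

```python
# Sketch of what the doc_to_text template above produces for one processed
# document. The document values here are made up for illustration.
doc = {
    "question": "下列何者是台灣的首都?",
    "choices": ["台中", "台北", "高雄", "台南"],
    "answer": "B",
}

prompt = (
    f"{doc['question'].strip()}\n"
    f"A. {doc['choices'][0]}\n"
    f"B. {doc['choices'][1]}\n"
    f"C. {doc['choices'][2]}\n"
    f"D. {doc['choices'][3]}\n"
    "Answer:"
)
# The harness appends each option letter from doc_to_choice and compares
# loglikelihoods; doc_to_target ("answer", here "B") marks the gold option.
print(prompt)
```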
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import os
import pandas as pd
import yaml
from tqdm import tqdm
categories = {
    "localization": [
        "tw_truthful_qa",
    ],
}

task_list = [
    "tw_truthful_qa",
]

subject2name = {}
# subject2category = {}
SUBJECTS = {}
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_yaml_path", default="_default_template_yaml")
    parser.add_argument("--save_prefix_path", default="twllm_eval")
    parser.add_argument("--cot_prompt_path", default=None)
    parser.add_argument("--task_prefix", default="")
    parser.add_argument("--group_prefix", default="")
    parser.add_argument("--subject_file", default="subject.tsv")
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()

    from pathlib import Path

    # Initialization
    SUBJECT_FILE = Path(__file__).parent / Path(args.subject_file)

    df = pd.read_csv(SUBJECT_FILE, delimiter="\t")

    for _, row in df.iterrows():
        for _c in categories:
            if row["subject"] in SUBJECTS:
                raise ValueError("Duplicate tasks.")
            if row["category"] in categories[_c]:  # append new item into SUBJECTS
                SUBJECTS[row["subject"]] = _c
                subject2name[row["subject"]] = row["name"]
                break
    # End of SUBJECTS initialization

    # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
    with open(args.base_yaml_path) as f:
        base_yaml = yaml.full_load(f)

    if args.cot_prompt_path is not None:
        import json

        with open(args.cot_prompt_path) as f:
            cot_file = json.load(f)

    ALL_CATEGORIES = []
    for subject, category in tqdm(SUBJECTS.items()):
        if category not in ALL_CATEGORIES:
            ALL_CATEGORIES.append(category)

        if args.cot_prompt_path is not None:
            description = cot_file[subject]
        else:
            name_of_subject = subject2name[subject].replace("_", " ")
            # "The following are multiple-choice questions about {subject};
            # please provide the option of the correct answer."
            description = f"以下為{name_of_subject}的單選題,請提供正確答案的選項。\n\n"

        yaml_dict = {
            "include": base_yaml_name,
            "group": f"twllm_eval_{args.task_prefix}_{category}"
            if args.task_prefix != ""
            else f"twllm_eval_{category}",
            "group_alias": category.replace("_", " "),
            "task": f"twllm_eval_{args.task_prefix}_{subject}"
            if args.task_prefix != ""
            else f"twllm_eval_{subject}",
            "task_alias": subject.replace("_", " "),
            "dataset_name": subject,
            "description": description,
        }

        file_save_path = args.save_prefix_path + f"_{subject}.yaml"
        # eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
        with open(file_save_path, "w") as yaml_file:
            yaml.dump(
                yaml_dict,
                yaml_file,
                # width=float("inf"),
                allow_unicode=True,
                default_style='"',
            )

    if args.task_prefix != "":
        mmlu_subcategories = [
            f"twllm_eval_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
        ]
    else:
        mmlu_subcategories = [f"twllm_eval_{category}" for category in ALL_CATEGORIES]

    if args.group_prefix != "":
        file_save_path = args.group_prefix + ".yaml"
    else:
        file_save_path = args.save_prefix_path + ".yaml"

    # eval_logger.info(f"Saving benchmark config to {file_save_path}")
    with open(file_save_path, "w") as yaml_file:
        yaml.dump(
            {
                "group": f"twllm_eval_{args.task_prefix}"
                if args.task_prefix != ""
                else "twllm_eval",
                "task": mmlu_subcategories,
            },
            yaml_file,
            indent=4,
            default_flow_style=False,
        )
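Running the script above with no arguments (so the argparse defaults `--base_yaml_path _default_template_yaml`, `--save_prefix_path twllm_eval`, and `--subject_file subject.tsv` apply) reads the subject table below and writes one per-subject config, `twllm_eval_tw_truthful_qa.yaml`, plus the `twllm_eval.yaml` group file listing `twllm_eval_localization`.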
subject name category
tw_truthful_qa 台灣在地化 tw_truthful_qa
group: twllm_eval
task:
- twllm_eval_localization
"dataset_name": "tw_truthful_qa"
"description": "以下為台灣在地化的單選題,請提供正確答案的選項。\n\n"
"group": "twllm_eval_localization"
"group_alias": "localization"
"include": "_default_template_yaml"
"task": "twllm_eval_tw_truthful_qa"
"task_alias": "tw truthful qa"
import datasets


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _helper(doc):
        # Modifies the contents of a single document in our dataset:
        # collect the four option columns into a "choices" list and map the
        # letter answer to its index. `map` keeps the original columns, so
        # "question" and "answer" stay available to the YAML template.
        answer_list = ["A", "B", "C", "D"]
        out_doc = {
            "questions": doc["question"],
            "choices": [doc["A"], doc["B"], doc["C"], doc["D"]],
            "goal": answer_list.index(doc["answer"]),
        }
        return out_doc

    return dataset.map(_helper)  # returns a datasets.Dataset object
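A quick way to sanity-check `process_docs` is to run it on a tiny in-memory dataset whose columns mirror the raw schema implied above (`question`, `A`–`D`, `answer`); the sample row is made up and this file is assumed to be importable as `utils`:

```python
# Minimal sanity check for process_docs on an in-memory dataset whose columns
# mirror the raw schema (question, A-D option columns, letter answer).
# The sample row is made up; assumes this module is importable as `utils`.
import datasets

from utils import process_docs

raw = datasets.Dataset.from_dict(
    {
        "question": ["下列何者是台灣的首都?"],
        "A": ["台中"],
        "B": ["台北"],
        "C": ["高雄"],
        "D": ["台南"],
        "answer": ["B"],
    }
)

processed = process_docs(raw)
print(processed[0]["choices"])  # ['台中', '台北', '高雄', '台南']
print(processed[0]["goal"])     # 1 -- index of the letter answer
print(processed[0]["answer"])   # 'B' -- still present; used by doc_to_target
```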