Unverified commit 65546905 authored by Haonan Li, committed by GitHub

Add ACLUE task (#1614)

* Add task ACLUE

* fix minor bug

* fix code style

* fix code style
parent c7b03ad4
# ACLUE

### Paper

Can Large Language Model Comprehend Ancient Chinese? A Preliminary Test on ACLUE
https://arxiv.org/abs/2310.09550

The Ancient Chinese Language Understanding Evaluation (ACLUE) is an evaluation benchmark focused on ancient Chinese language comprehension. It aims to assess the performance of large language models at understanding ancient Chinese. The benchmark comprises 15 tasks spanning various domains, including lexical, syntactic, semantic, inference, and knowledge. ACLUE's tasks are derived from a combination of manually curated questions from publicly available resources and automatically generated questions from classical Chinese corpora. The questions span from the Xia dynasty (2070 BCE) to the Ming dynasty (1368 CE). ACLUE adopts a multiple-choice question format for all tasks.

Homepage: https://github.com/isen-zhang/ACLUE

### Citation

```bibtex
@inproceedings{zhang-li-2023-large,
    title = "Can Large Langauge Model Comprehend {A}ncient {C}hinese? A Preliminary Test on {ACLUE}",
    author = "Zhang, Yixuan and Li, Haonan",
    booktitle = "Proceedings of the Ancient Language Processing Workshop",
    month = sep,
    year = "2023",
    address = "Varna, Bulgaria",
    publisher = "INCOMA Ltd., Shoumen, Bulgaria",
    url = "https://aclanthology.org/2023.alp-1.9",
    pages = "80--87"
}
```
### Groups and Tasks

#### Groups

- `aclue`: All 15 subjects of the ACLUE dataset, evaluated following the methodology in CMMLU's original implementation.

#### Tasks

The following tasks evaluate subjects in the ACLUE dataset using loglikelihood-based multiple-choice scoring:
- `aclue_{subject_english}`
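
For orientation, here is a hedged sketch of running the `aclue` group through the harness's Python entry point (`lm_eval.simple_evaluate`); the checkpoint name below is a placeholder and not part of this PR:

```python
# Sketch only: evaluate the ACLUE group with lm-evaluation-harness' Python API.
# The pretrained checkpoint is a placeholder; any model served by the `hf`
# backend should work.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=Qwen/Qwen1.5-0.5B",  # placeholder checkpoint
    tasks=["aclue"],    # or a single subtask, e.g. ["aclue_couplet_prediction"]
    num_fewshot=5,      # optional; overrides the per-task default
)
print(results["results"])  # per-task and group acc / acc_norm
```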
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: aclue
dataset_path: tyouisen/aclue
test_split: test
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: multiple_choice
doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
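
For clarity, the Jinja templates above render each record into a prompt that ends with "答案:" ("Answer:"), and the target is the index of the gold letter. A plain-Python illustration follows; it is not the harness's own rendering code, and the record is a placeholder that only reuses the dataset's field names:

```python
# Illustrative re-implementation of doc_to_text / doc_to_target from the YAML above.
# The record is a placeholder; only the field names (Question, A-D, Answer) are real.
doc = {"Question": "……", "A": "……", "B": "……", "C": "……", "D": "……", "Answer": "B"}


def doc_to_text(doc: dict) -> str:
    # "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:"
    return (
        f"{doc['Question'].strip()}\n"
        f"A. {doc['A']}\nB. {doc['B']}\nC. {doc['C']}\nD. {doc['D']}\n答案:"
    )


def doc_to_target(doc: dict) -> int:
    # "{{['A', 'B', 'C', 'D'].index(Answer)}}" -> position of the gold letter
    return ["A", "B", "C", "D"].index(doc["Answer"])


print(doc_to_text(doc))    # multiple-choice prompt ending in "答案:"
print(doc_to_target(doc))  # 1, i.e. choice "B"
```

With `output_type: multiple_choice`, the harness scores the loglikelihood of each letter in `doc_to_choice` as a continuation of that prompt, which is what the `acc` and `acc_norm` metrics aggregate.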
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
import yaml
from tqdm import tqdm
from lm_eval.utils import eval_logger
SUBJECTS = {
"古文单字多义": "polysemy_resolution",
"诗词情感分类": "poetry_sentiment_analysis",
"古汉语命名体识别": "named_entity_recognition",
"古汉语知识": "basic_ancient_chinese",
"古诗词上下句预测": "poetry_context_prediction",
"古文断句": "sentence_segmentation",
"对联": "couplet_prediction",
"古诗词曲鉴赏": "poetry_appreciate",
"国学常识": "ancient_chinese_culture",
"古音学": "ancient_phonetics",
"通假字": "homographic_character_resolution",
"古代文学知识": "ancient_literature",
"医古文": "ancient_medical",
"古诗词质量评估": "poetry_quality_assessment",
"古文阅读理解": "reading_comprehension",
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="aclue")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
for subject_zh, subject_eng in tqdm(SUBJECTS.items()):
if args.cot_prompt_path is not None:
description = cot_file[subject_eng]
else:
description = (
f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n"
)
yaml_dict = {
"include": base_yaml_name,
"task": f"aclue_{args.task_prefix}_{subject_eng}"
if args.task_prefix != ""
else f"aclue_{subject_eng}",
"dataset_name": subject_eng,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
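
A brief note on the per-subject files that follow: `yaml.dump(..., allow_unicode=True, default_style='"')` double-quotes every key and value and sorts keys alphabetically, which is why each generated YAML below looks the way it does. A minimal PyYAML sketch (subject chosen arbitrarily):

```python
# Minimal PyYAML sketch: default_style='"' double-quotes every scalar, and
# dump() sorts keys alphabetically by default, matching the files below.
import yaml

print(
    yaml.dump(
        {
            "include": "_default_template_yaml",
            "task": "aclue_couplet_prediction",
            "dataset_name": "couplet_prediction",
        },
        allow_unicode=True,
        default_style='"',
    )
)
# "dataset_name": "couplet_prediction"
# "include": "_default_template_yaml"
# "task": "aclue_couplet_prediction"
```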
"dataset_name": "ancient_chinese_culture"
"description": "以下是关于国学常识的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_chinese_culture"
"dataset_name": "ancient_literature"
"description": "以下是关于古代文学知识的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_literature"
"dataset_name": "ancient_medical"
"description": "以下是关于医古文的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_medical"
"dataset_name": "ancient_phonetics"
"description": "以下是关于古音学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_phonetics"
"dataset_name": "basic_ancient_chinese"
"description": "以下是关于古汉语知识的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_basic_ancient_chinese"
"dataset_name": "couplet_prediction"
"description": "以下是关于对联的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_couplet_prediction"
"dataset_name": "homographic_character_resolution"
"description": "以下是关于通假字的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_homographic_character_resolution"
"dataset_name": "named_entity_recognition"
"description": "以下是关于古汉语命名体识别的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_named_entity_recognition"
"dataset_name": "poetry_appreciate"
"description": "以下是关于古诗词曲鉴赏的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_poetry_appreciate"
"dataset_name": "poetry_context_prediction"
"description": "以下是关于古诗词上下句预测的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_poetry_context_prediction"
"dataset_name": "poetry_quality_assessment"
"description": "以下是关于古诗词质量评估的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_poetry_quality_assessment"
"dataset_name": "poetry_sentiment_analysis"
"description": "以下是关于诗词情感分类的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_poetry_sentiment_analysis"
"dataset_name": "polysemy_resolution"
"description": "以下是关于古文单字多义的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_polysemy_resolution"
"dataset_name": "reading_comprehension"
"description": "以下是关于古文阅读理解的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_reading_comprehension"
"dataset_name": "sentence_segmentation"
"description": "以下是关于古文断句的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_sentence_segmentation"
@@ -7,7 +7,7 @@ import os
 import yaml
 from tqdm import tqdm
-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger
 SUBJECTS = {