Commit 3e5e9da2 authored by lintangsutawika

merged from main

parents d429b47f 7852985b
# Generated by utils.py
dataset_name: eu_osakidetza5e
include: eus_exams_eu
task: eus_exams_eu_osakidetza5e
# Generated by utils.py
dataset_name: eu_osakidetza6e
include: eus_exams_eu
task: eus_exams_eu_osakidetza6e
# Generated by utils.py
dataset_name: eu_osakidetza7e
include: eus_exams_eu
task: eus_exams_eu_osakidetza7e
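The stubs above are emitted programmatically ("# Generated by utils.py"). A hypothetical sketch of the kind of loop that would produce them; the filename pattern and the loop itself are assumptions, not the actual generator code:

```
import yaml

# Hypothetical generator sketch: one small YAML per dataset config,
# each including the shared base task definition.
for config in ["eu_osakidetza5e", "eu_osakidetza6e", "eu_osakidetza7e"]:
    stub = {
        "dataset_name": config,
        "include": "eus_exams_eu",
        "task": f"eus_exams_{config}",
    }
    with open(f"eus_exams_{config}.yaml", "w") as f:
        f.write("# Generated by utils.py\n")
        yaml.dump(stub, f)  # keys are sorted, matching the stub order above
```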
import datasets


def process_docs(dataset: datasets.Dataset):
    """Filter out examples with no answer."""

    def valid_example(example: dict) -> bool:
        """Check if an example is valid."""
        if example["answer"] not in [0, 1, 2, 3]:
            return False
        if example["candidates"] == ["", "", "", ""]:
            return False
        return True

    return dataset.filter(valid_example)
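A quick way to sanity-check this filter (a minimal sketch, not part of the task files; it assumes the `process_docs` above is in scope):

```
import datasets

# Toy in-memory dataset with one valid row and two invalid ones.
docs = datasets.Dataset.from_list(
    [
        {"answer": 2, "candidates": ["a", "b", "c", "d"]},  # kept
        {"answer": 5, "candidates": ["a", "b", "c", "d"]},  # dropped: answer out of range
        {"answer": 0, "candidates": ["", "", "", ""]},      # dropped: all candidates empty
    ]
)
print(len(process_docs(docs)))  # -> 1
```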
# EusProficiency
### Paper
Title: Latxa: An Open Language Model and Evaluation Suite for Basque
Abstract: https://arxiv.org/abs/2403.20266
EusProficiency comprises 5,169 exercises on different topics from past EGA exams, the official C1-level certificate of proficiency in Basque. We collected the atarikoa exercises from EGA exams from 1998 to 2008. Atarikoa is the first qualifying test of EGA, which measures different aspects of language competency, such as reading comprehension, grammar, vocabulary, spelling, and writing. Each test generally has 85 multiple-choice questions, with 4 choices and a single correct answer.
Homepage: https://github.com/hitz-zentroa/latxa
### Citation
```
@misc{etxaniz2024latxa,
title={Latxa: An Open Language Model and Evaluation Suite for Basque},
author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa},
year={2024},
eprint={2403.20266},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
There are no groups.
#### Tasks
* `eus_proficiency`: EusProficiency comprises 5,169 exercises on different topics from past EGA exams, the official C1-level certificate of proficiency in Basque.
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: HiTZ/EusProficiency
dataset_name: default
task: eus_proficiency
doc_to_text: "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nErantzuna:"
doc_to_choice: ["A", "B", "C", "D"]
validation_split: null
test_split: test
fewshot_split: test
output_type: multiple_choice
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
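For reference, the `doc_to_text` above is a Jinja2 template (the harness renders these templates with Jinja2). A minimal sketch of how it renders, using a made-up document with the field names from the config:

```
from jinja2 import Template

# Hypothetical example document; "question", "candidates" and "answer"
# follow the config above.
doc = {
    "question": "Zein da zuzena?",
    "candidates": ["bat", "bi", "hiru", "lau"],
    "answer": 1,
}
template = Template(
    "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}"
    "\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nErantzuna:"
)
print(template.render(**doc))
# Galdera: Zein da zuzena?
# A: bat
# ...
# Erantzuna:
```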
# EusReading
### Paper
Title: Latxa: An Open Language Model and Evaluation Suite for Basque
Abstract: https://arxiv.org/abs/2403.20266
EusReading consists of 352 reading comprehension exercises (irakurmena) sourced from the set of past EGA exams from 1998 to 2008. Each test generally has 10 multiple-choice questions, with 4 choices and a single correct answer. These exercises are more challenging than Belebele due to the complexity and length of the input texts. As a result, EusReading is useful for measuring the long-context understanding of models.
Homepage: https://github.com/hitz-zentroa/latxa
### Citation
```
@misc{etxaniz2024latxa,
title={Latxa: An Open Language Model and Evaluation Suite for Basque},
author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa},
year={2024},
eprint={2403.20266},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
There are no groups.
#### Tasks
* `eus_reading`: EusReading consists of 352 reading comprehension exercises (irakurmena) sourced from the set of past EGA exams from 1998 to 2008.
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: HiTZ/EusReading
dataset_name: default
task: eus_reading
doc_to_text: !function utils.doc_to_text_context
doc_to_choice: !function utils.doc_to_choice
validation_split: null
test_split: test
fewshot_split: test
output_type: multiple_choice
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
from typing import List

letters = ["A", "B", "C", "D"]


def doc_to_text_context(doc) -> str:
    """
    Converts a document to a formatted string.

    Args:
        doc (dict): A dictionary containing the document information.

    Returns:
        str: A formatted string containing the question and answer choices.
    """
    candidates = doc["candidates"]
    num_choices = len(candidates)
    if num_choices < 2:
        raise ValueError("Invalid number of candidates")
    choices = letters[:num_choices]
    formatted_choices = "\n".join(
        [f"{choice}: {candidates[i]}" for i, choice in enumerate(choices)]
    )
    return f"Pasartea: {doc['context']}\n\nGaldera: {doc['question']}\n{formatted_choices}\nErantzuna:"


def doc_to_choice(doc) -> List[str]:
    """
    Returns the answer choices for a document.

    Args:
        doc (dict): A dictionary containing the document information.

    Returns:
        list: A list of strings containing the answer choices.
    """
    num_choices = len(doc["candidates"])
    if num_choices < 2:
        raise ValueError("Invalid number of candidates")
    return letters[:num_choices]
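A small usage sketch for the helpers above, with a made-up document (the `context`/`question`/`candidates` fields are what the functions expect):

```
doc = {
    "context": "Testu laburra.",
    "question": "Zer dio testuak?",
    "candidates": ["aukera bat", "beste bat", "hirugarrena"],
}
print(doc_to_text_context(doc))
# Pasartea: Testu laburra.
#
# Galdera: Zer dio testuak?
# A: aukera bat
# B: beste bat
# C: hirugarrena
# Erantzuna:
print(doc_to_choice(doc))  # ['A', 'B', 'C'] -- only as many letters as candidates
```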
# EusTrivia
### Paper
Title: Latxa: An Open Language Model and Evaluation Suite for Basque
Abstract: https://arxiv.org/abs/2403.20266
EusTrivia consists of 1,715 trivia questions from multiple online sources. 56.3% of the questions are elementary level (grades 3-6), while the rest are considered challenging. A significant portion of the questions focus specifically on the Basque Country, its language and culture. Each multiple-choice question contains two, three or four choices (3.84 on average) and a single correct answer. Five areas of knowledge are covered:
- **Humanities and Natural Sciences** (27.8%): This category encompasses questions about history, geography, biology, ecology and other social and natural sciences.
- **Leisure and Art** (24.5%): This category includes questions on sports and athletes, performing and plastic arts and artists, architecture, cultural events, and related topics.
- **Music** (16.0%): This category groups all questions about music and musicians, both classical and contemporary.
- **Language and Literature** (17.1%): This category is concerned with all kinds of literary productions and writers, as well as metalinguistic questions (e.g., definitions, synonyms, and word usage).
- **Mathematics and ICT** (14.5%): This category covers mathematical problems and questions about ICT, as well as questions about people known for their contributions to these fields of knowledge.
Homepage: https://github.com/hitz-zentroa/latxa
### Citation
```
@misc{etxaniz2024latxa,
title={Latxa: An Open Language Model and Evaluation Suite for Basque},
author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa},
year={2024},
eprint={2403.20266},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
There are no groups.
#### Tasks
* `eus_trivia`: EusTrivia consists of 1,715 trivia questions from multiple online sources.
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: HiTZ/EusTrivia
dataset_name: default
task: eus_trivia
doc_to_text: !function utils.doc_to_text
doc_to_choice: !function utils.doc_to_choice
validation_split: null
test_split: test
fewshot_split: test
output_type: multiple_choice
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
from typing import List

letters = ["A", "B", "C", "D"]


def doc_to_text(doc) -> str:
    """
    Converts a document to a formatted string.

    Args:
        doc (dict): A dictionary containing the document information.

    Returns:
        str: A formatted string containing the question and answer choices.
    """
    candidates = doc["candidates"]
    num_choices = len(candidates)
    if num_choices < 2:
        raise ValueError("Invalid number of candidates")
    choices = letters[:num_choices]
    formatted_choices = "\n".join(
        [f"{choice}: {candidates[i]}" for i, choice in enumerate(choices)]
    )
    return f"Galdera: {doc['question']}\n{formatted_choices}\nErantzuna:"


def doc_to_choice(doc) -> List[str]:
    """
    Returns the answer choices for a document.

    Args:
        doc (dict): A dictionary containing the document information.

    Returns:
        list: A list of strings containing the answer choices.
    """
    num_choices = len(doc["candidates"])
    if num_choices < 2:
        raise ValueError("Invalid number of candidates")
    return letters[:num_choices]
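Because EusTrivia questions have two to four choices, the letter list is truncated per document. A sketch with a hypothetical two-choice question:

```
doc = {"question": "Bai ala ez?", "candidates": ["bai", "ez"]}
print(doc_to_text(doc))
# Galdera: Bai ala ez?
# A: bai
# B: ez
# Erantzuna:
print(doc_to_choice(doc))  # ['A', 'B']
```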
@@ -5,11 +5,11 @@ dataset_name: qqp
 output_type: multiple_choice
 training_split: train
 validation_split: validation
-doc_to_text: "\nSentence 1: {{question1}}\nSentence 2: {{question2}}\nAnswer:"
+doc_to_text: "Question 1: {{question1}}\nQuestion 2: {{question2}}\nQuestion: Do both questions ask the same thing?\nAnswer:"
 doc_to_target: label
 doc_to_choice: ["no", "yes"]
 metric_list:
   - metric: acc
   - metric: f1
 metadata:
-  version: 1.0
+  version: 2.0
@@ -7,8 +7,9 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 doc_to_text: !function util.doc_to_text
-doc_to_target: "{{answers}}"
-doc_to_choice: "{{entities}}"
+doc_to_target: !function util.doc_to_target
+doc_to_choice: !function util.doc_to_choice
+process_docs: !function util.process_docs
 process_results: !function util.process_results
 metric_list:
   - metric: f1
@@ -17,4 +18,4 @@ metric_list:
     higher_is_better: True
     aggregation: mean
 metadata:
-  version: 1.0
+  version: 2.0
import datasets
import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics
@@ -21,6 +22,22 @@ def doc_to_target(doc):
     return format_answer(query=doc["query"], entity=doc["answers"][0])


 def doc_to_choice(doc):
     return [format_answer(query=doc["query"], entity=ans) for ans in doc["entities"]]


+def process_docs(dataset: datasets.Dataset):
+    def _process_doc(doc):
+        return {
+            "passage": doc["passage"],
+            "query": doc["query"],
+            "entities": sorted(list(set(doc["entities"]))),
+            "answers": sorted(list(set(doc["answers"]))),
+        }
+
+    return dataset.map(_process_doc)
+
+
+def process_results(doc, results):
+    # ReCoRD's evaluation is actually deceptively simple:
+    # - Pick the maximum likelihood prediction entity
......
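A side note on the added `process_docs`: ReCoRD passages can list the same entity once per mention, so the raw `entities` field may contain duplicates; deduplicating and sorting yields one stable choice per entity. A toy illustration, not taken from the diff:

```
entities = ["Paris", "UN", "Paris", "EU"]
print(sorted(set(entities)))  # ['EU', 'Paris', 'UN']
```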
# TMMLU+
### Paper
Title: `An Improved Traditional Chinese Evaluation Suite for Foundation Model`
Abstract: `We present TMMLU+, a comprehensive dataset designed for Traditional Chinese massive multitask language understanding. TMMLU+ is a multiple-choice question-answering dataset with 66 subjects from elementary to professional level. Compared to its predecessor, TMMLU, TMMLU+ is six times larger and boasts a more balanced subject distribution. We included benchmark results in TMMLU+ from closed-source models and 24 open-weight Chinese large language models of parameters ranging from 1.8B to 72B. Our findings reveal that Traditional Chinese models still trail behind their Simplified Chinese counterparts. Additionally, current large language models have yet to outperform human performance in average scores. We publicly release our dataset and the corresponding benchmark source code.`
Homepage: [https://huggingface.co/datasets/ikala/tmmluplus](https://huggingface.co/datasets/ikala/tmmluplus)
### Citation
```
@article{ikala2024improved,
title={An Improved Traditional Chinese Evaluation Suite for Foundation Model},
author={Tam, Zhi-Rui and Pai, Ya-Ting and Lee, Yen-Wei and Cheng, Sega and Shuai, Hong-Han},
journal={arXiv preprint arXiv:2403.01858},
year={2024}
}
```
### Groups and Tasks
#### Groups
* `tmmluplus`: `The dataset comprises 22,690 multiple-choice questions from 66 subjects ranging from primary to professional level. `
#### Tasks
The following tasks evaluate subjects in the TMMLU+ dataset using loglikelihood-based multiple-choice scoring:
* `tmmluplus_{subject_english}`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: ZoneTwelve/tmmluplus # a copy of `ikala/tmmluplus`
test_split: test
fewshot_split: train
fewshot_config:
  sampler: first_n
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.1
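The `doc_to_text` above calls `.strip()` inside the template, which works because Jinja2 permits method calls on context values. A rendering sketch with a hypothetical document (field names assumed from the config):

```
from jinja2 import Template

doc = {"question": " 下列何者正確? ", "choices": ["甲", "乙", "丙", "丁"]}
tmpl = Template(
    "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}"
    "\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
)
print(tmpl.render(**doc))  # leading/trailing spaces around the question are removed
```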
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import os
import pandas as pd
import yaml
from tqdm import tqdm
# Copied from https://github.com/iKala/ievals/blob/main/ievals/settings.py
# from the official TMMLU+ example
categories = {
    "STEM": [
        "physics",
        "chemistry",
        "biology",
        "computer science",
        "math",
        "engineering",
    ],
    "humanities": ["history", "philosophy", "law"],
    "social_sciences": [
        "politics",
        "culture",
        "economics",
        "geography",
        "psychology",
        "education",
    ],
    "other": ["other", "business", "health"],  # (business, health, misc.)
}
task_list = [
    "engineering_math",
    "dentistry",
    "traditional_chinese_medicine_clinical_medicine",
    "clinical_psychology",
    "technical",
    "culinary_skills",
    "mechanical",
    "logic_reasoning",
    "real_estate",
    "general_principles_of_law",
    "finance_banking",
    "anti_money_laundering",
    "ttqav2",
    "marketing_management",
    "business_management",
    "organic_chemistry",
    "advance_chemistry",
    "physics",
    "secondary_physics",
    "human_behavior",
    "national_protection",
    "jce_humanities",
    "politic_science",
    "agriculture",
    "official_document_management",
    "financial_analysis",
    "pharmacy",
    "educational_psychology",
    "statistics_and_machine_learning",
    "management_accounting",
    "introduction_to_law",
    "computer_science",
    "veterinary_pathology",
    "accounting",
    "fire_science",
    "optometry",
    "insurance_studies",
    "pharmacology",
    "taxation",
    "education_(profession_level)",
    "economics",
    "veterinary_pharmacology",
    "nautical_science",
    "occupational_therapy_for_psychological_disorders",
    "trust_practice",
    "geography_of_taiwan",
    "physical_education",
    "auditing",
    "administrative_law",
    "basic_medical_science",
    "macroeconomics",
    "trade",
    "chinese_language_and_literature",
    "tve_design",
    "junior_science_exam",
    "junior_math_exam",
    "junior_chinese_exam",
    "junior_social_studies",
    "tve_mathematics",
    "tve_chinese_language",
    "tve_natural_sciences",
    "junior_chemistry",
    "music",
    "education",
    "three_principles_of_people",
    "taiwanese_hokkien",
]
subject2name = {}
# subject2category = {}
SUBJECTS = {}
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_yaml_path", required=True)
    parser.add_argument("--save_prefix_path", default="tmmluplus")
    parser.add_argument("--cot_prompt_path", default=None)
    parser.add_argument("--task_prefix", default="")
    parser.add_argument("--group_prefix", default="")
    parser.add_argument("--subject_file", default="subject.tsv")
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()

    from pathlib import Path

    # Initialization
    SUBJECT_FILE = Path(__file__).parent / Path(args.subject_file)

    df = pd.read_csv(SUBJECT_FILE, delimiter="\t")

    for _, row in df.iterrows():
        for _c in categories:
            if row["subject"] in SUBJECTS:
                raise ValueError("Duplicate tasks.")
            if row["category"] in categories[_c]:  # append new item into SUBJECTS
                SUBJECTS[row["subject"]] = _c
                subject2name[row["subject"]] = row["name"]
                break
    # End of SUBJECTS initialization

    # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
    with open(args.base_yaml_path) as f:
        base_yaml = yaml.full_load(f)

    if args.cot_prompt_path is not None:
        import json

        with open(args.cot_prompt_path) as f:
            cot_file = json.load(f)

    ALL_CATEGORIES = []
    for subject, category in tqdm(SUBJECTS.items()):
        if category not in ALL_CATEGORIES:
            ALL_CATEGORIES.append(category)

        if args.cot_prompt_path is not None:
            description = cot_file[subject]
        else:
            name_of_subject = subject2name[subject].replace("_", " ")
            description = f"以下為{name_of_subject}的單選題,請提供正確答案的選項。\n\n"
            # description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"

        yaml_dict = {
            "include": base_yaml_name,
            "group": f"tmmluplus_{args.task_prefix}_{category}"
            if args.task_prefix != ""
            else f"tmmluplus_{category}",
            "group_alias": category.replace("_", " "),
            "task": f"tmmluplus_{args.task_prefix}_{subject}"
            if args.task_prefix != ""
            else f"tmmluplus_{subject}",
            "task_alias": subject.replace("_", " "),
            "dataset_name": subject,
            "description": description,
        }

        file_save_path = args.save_prefix_path + f"_{subject}.yaml"
        # eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
        with open(file_save_path, "w") as yaml_file:
            yaml.dump(
                yaml_dict,
                yaml_file,
                # width=float("inf"),
                allow_unicode=True,
                default_style='"',
            )

    if args.task_prefix != "":
        mmlu_subcategories = [
            f"tmmluplus_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
        ]
    else:
        mmlu_subcategories = [f"tmmluplus_{category}" for category in ALL_CATEGORIES]

    if args.group_prefix != "":
        file_save_path = args.group_prefix + ".yaml"
    else:
        file_save_path = args.save_prefix_path + ".yaml"

    # eval_logger.info(f"Saving benchmark config to {file_save_path}")
    with open(file_save_path, "w") as yaml_file:
        yaml.dump(
            {
                "group": f"tmmluplus_{args.task_prefix}"
                if args.task_prefix != ""
                else "tmmluplus",
                "task": mmlu_subcategories,
            },
            yaml_file,
            indent=4,
            default_flow_style=False,
        )
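The `default_style='"'` argument passed to `yaml.dump` above is what makes every scalar in the generated per-subject files double-quoted, as in the example file further below. A minimal sketch of that behavior:

```
import yaml

# default_style='"' forces double quotes around every scalar, keys included.
print(
    yaml.dump(
        {"dataset_name": "accounting", "include": "_default_template_yaml"},
        allow_unicode=True,
        default_style='"',
    )
)
# "dataset_name": "accounting"
# "include": "_default_template_yaml"
```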
group: tmmluplus
task:
  - tmmluplus_other
  - tmmluplus_social_sciences
  - tmmluplus_humanities
  - tmmluplus_STEM
"dataset_name": "accounting"
"description": "以下為會計學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other"
"group_alias": "other"
"include": "_default_template_yaml"
"task": "tmmluplus_accounting"
"task_alias": "accounting"