Unverified Commit 383bbd54 authored by Jess's avatar Jess Committed by GitHub
Browse files

Irokobench: Benchmark Dataset for African languages (#2042)



* add afrixnli to task

* add chat completion

* remove chat completion -untested

* afrimmlu added

* afrimmlu folder update

* afrimmlu folder update

* updated prompt

* remove print

* add afrimgsm -direct

* add squad metric

* fix bash script

* remove direct util, update common yaml

* remove print

* add few show. metric fixes

* fix direct path, add bash script for gpt models

* added transate test

* update afrixnli tasks

* update afrixnli tasks

* update metrics for afrixnli

* prompt translations fix

* prompt translations fix

* filter and metric fix -mgsm

* remove squad metric

* remove squad metric

* add f1 score to mgsm

* add f1 score to mgsm

* update native-direct with lin

* change f1 function

* add lin to utils

* add utils

* remove test limit

* remove test configs

* add swahili to mmlu

* change eng to ewe in ewe yaml mmlu

* add squad metric to mgsm, remove whitespace filter

* added translate test

* added afrixnli_translate

* fix exact match valueError

* fix exact match valueError

* restructure mmlu folder

* spacing

* remove afrimmlu_translate folder

* add utility

* format task name, clean ups

* modefied mgsm

* update on afrimgsm

* update on afrimgsm

* removed utils

* other mgsm varieties

* other mgsm varieties

* adding trasnslate direct

* Update translate_direct_yaml

* add manual xnli prompt, add multichoice for openai models, and adapt multichoice metric for openai model

* edit for open models

* Update translate_direct_yaml

* add verbalizer for xnli

* change xnli from multiple choice to generate

* add manual accuracy scores

* revert xnli to multiple choice

* change afrimgsm utils

* revert xnli to multiple_choice

* cleanups and readmes

* remove openai fixes and unused regex

* pr review changes

* revert metrics.py, task.py and extraction.py to main version

---------
Co-authored-by: default avatarIsrael Abebe Azime <azime@cg.uni-saarland.de>
Co-authored-by: default avatarIsrael Abebe Azime <se.israel.abebe@gmail.com>
parent d5f39bf8
# Generated by utils.py
dataset_name: yor
doc_to_target: '{% if answer is not none %}{{answer[16:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- 'Question:'
- </s>
- <|im_end|>
include: translate_direct_yaml
task: afrimgsm_translate_direct_yor
# Generated by utils.py
dataset_name: zul
doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- 'Question:'
- </s>
- <|im_end|>
include: translate_direct_yaml
task: afrimgsm_translate_direct_zul
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group:
- mgsm
- afrimgsm
- afrimgsm_translate
dataset_path: masakhane/afrimgsm-translate-test
dataset_name: null # Overridden by language-specific config.
output_type: generate_until
test_split: test
generation_kwargs:
until:
- "\n\n"
- "\n"
do_sample: false
temperature: 0.0
target_delimiter: " "
filter_list:
- name: remove_whitespace
filter:
- function: remove_whitespace
- function: take_first
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
metadata:
version: 2.0
import argparse
import yaml
languages = ['eng', 'amh', 'ibo', 'fra', 'sna', 'lin', 'wol', 'ewe', 'lug', 'xho', 'kin', 'twi', 'zul', 'orm', 'yor',
'hau', 'sot', 'swa']
languages_REGEX = {"eng": "The answer is (\\-?[0-9\\.\\,]+)",
"amh": "መልሱ (\\-?[0-9\\.\\,]+)",
"ibo": "Azịza ya bụ (\\-?[0-9\\.\\,]+)",
'fra': "La réponse est(\\-?[0-9\\.\\,]+)",
'sna': "Mhinduro kumubvunzo ndi (\\-?[0-9\\.\\,]+)",
'lin': "Eyano ezali (\\-?[0-9\\.\\,]+)",
'wol': "Tontu li (\\-?[0-9\\.\\,]+)",
'ewe': "ŋuɖoɖoae nye (\\-?[0-9\\.\\,]+)",
'lug': "Ansa eri (\\-?[0-9\\.\\,]+)",
'xho': "Impendulo ngu (\\-?[0-9\\.\\,]+)",
'kin': "Igisubizo ni (\\-?[0-9\\.\\,]+)",
'twi': "Ne nnyiano yɛ (\\-?[0-9\\.\\,]+)",
'zul': "Impendulo ithi (\\-?[0-9\\.\\,]+)",
'orm': "Deebiin isaa (\\-?[0-9\\.\\,]+)",
'yor': "Ìdáhùn náà ni (\\-?[0-9\\.\\,]+)",
'hau': "Amsar ita ce (\\-?[0-9\\.\\,]+)",
'sot': "Karabo ke (\\-?[0-9\\.\\,]+)",
'swa': "Jibu ni (\\-?[0-9\\.\\,]+)",
}
LANGUAGES = {}
for lang in languages:
if lang == 'amh':
LANGUAGES[lang] = { # English
"QUESTION": "ጥያቄ:",
"ANSWER": "በቅደም ተከተል መልስ:",
"DIRECT": "Answer:",
"REGEX": languages_REGEX[lang]}
elif lang == 'yor':
LANGUAGES[lang] = { # English
"QUESTION": "Ìbéèrè:",
"ANSWER": "Ìdáhùn lẹ́sẹsẹ:",
"DIRECT": "Answer:",
"REGEX": languages_REGEX[lang]}
else:
LANGUAGES[lang] = { # English
"QUESTION": "Question:",
"ANSWER": "Step-by-Step Answer:",
"DIRECT": "Answer:",
"REGEX": languages_REGEX[lang]}
def add_regex_pattern(regex_pattern):
if regex_pattern is None:
return {}
return {
"filter_list": [
{
"name": "strict-match",
"filter": [
{
"function": "regex",
"regex_pattern": f"""{regex_pattern}""",
},
{
"function": "take_first",
},
],
},
{
"name": "flexible-extract",
"filter": [
{
"function": "regex",
"regex_pattern": """(-?[$0-9.,]{2,})|(-?[0-9]+)""",
"group_select": -1,
},
{
"function": "take_first",
},
],
},
],
}
def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
"""
Generate a yaml file for each language.
:param output_dir: The directory to output the files to.
:param overwrite: Whether to overwrite files if they already exist.
"""
err = []
for lang in LANGUAGES.keys():
try:
yaml_template = "cot_yaml"
filter_list = {}
DELIMITER = None
if mode == "direct":
ANSWER = LANGUAGES['eng']["DIRECT"]
QUESTION = LANGUAGES['eng']["QUESTION"]
REGEX = None
task_name = f"afrimgsm_direct_{lang}"
yaml_template = "direct_yaml"
if mode == "direct-native":
ANSWER = LANGUAGES[lang]["DIRECT"]
QUESTION = LANGUAGES[lang]["QUESTION"]
REGEX = None
task_name = f"afrimgsm_direct_native_{lang}"
yaml_template = "direct_native_yaml"
elif mode == "native-cot":
ANSWER = LANGUAGES[lang]["ANSWER"]
REGEX = LANGUAGES[lang]["REGEX"]
QUESTION = LANGUAGES[lang]["QUESTION"]
task_name = f"afrimgsm_native_cot_{lang}"
filter_list = add_regex_pattern(REGEX)
DELIMITER = "" if lang in ["zh", "ja"] else None
elif mode == "en-cot":
ANSWER = LANGUAGES["eng"]["ANSWER"]
REGEX = LANGUAGES["eng"]["REGEX"]
QUESTION = LANGUAGES["eng"]["QUESTION"]
task_name = f"afrimgsm_en_cot_{lang}"
elif mode == "translate-direct":
ANSWER = LANGUAGES['eng']["DIRECT"]
QUESTION = LANGUAGES['eng']["QUESTION"]
REGEX = None
task_name = f"afrimgsm_translate_direct_{lang}"
yaml_template = "translate_direct_yaml"
file_name = f"{task_name}.yaml"
ANSWER_TO_SKIP = len(LANGUAGES[lang]["ANSWER"]) + 1
with open(
f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
) as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
"include": yaml_template,
"dataset_name": lang,
"task": f"{task_name}",
"doc_to_text": f"""{{% if answer is not none %}}"""
f"""{{{{question+"\\n{ANSWER}"}}}}"""
f"""{{% else %}}"""
f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
f"""{{% endif %}}""",
"doc_to_target": f"""{{% if answer is not none %}}"""
f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}"""
f"""{{% else %}}"""
f"""{{{{answer_number|string}}}}"""
f"""{{% endif %}}""",
**filter_list,
"generation_kwargs": {
"until": [QUESTION, "</s>", "<|im_end|>"],
"do_sample": False,
},
**({"target_delimiter": DELIMITER} if DELIMITER else {}),
},
f,
allow_unicode=True,
width=float("inf"),
)
except FileExistsError:
err.append(file_name)
if len(err) > 0:
raise FileExistsError(
"Files were not created because they already exist (use --overwrite flag):"
f" {', '.join(err)}"
)
def main() -> None:
"""Parse CLI args and generate language-specific yaml files."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--overwrite",
default=False,
action="store_true",
help="Overwrite files if they already exist",
)
parser.add_argument(
"--output-dir", default=".", help="Directory to write yaml files to"
)
parser.add_argument(
"--mode",
default="native-cot",
choices=["direct", "direct-native", "native-cot", "en-cot", "translate-direct"],
help="Mode of chain-of-thought",
)
args = parser.parse_args()
gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode)
if __name__ == "__main__":
main()
\ No newline at end of file
# MathQA
### Paper
IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models
https://arxiv.org/pdf/2406.03368
IrokoBench is a human-translated benchmark dataset for 16 typologically diverse
low-resource African languages covering three tasks: natural language inference (AfriXNLI),
mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU).
### Citation
```
@misc{adelani2024irokobenchnewbenchmarkafrican,
title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models},
author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp},
year={2024},
eprint={2406.03368},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2406.03368},
}
```
### Groups and Tasks
#### Groups
* `afrimmlu`: All afrimmlu tasks
* `afrimmlu_direct`: afrimmlu_direct evaluates models performance on the curated dataset
* `afrimmlu_translate`: afrimmlu_translate evaluates models in translate-test setting
#### Tasks
* `afrimmlu_direct_{language_code}`: each task evaluates for one language
* `afrimmlu_translate_{language_code}`: each task evaluates for one language
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
* [x] Checked for equivalence with v0.3.0 LM Evaluation Harness
group:
- mmlu
- afrimmlu
- afrimmlu_direct
task: null
dataset_path: masakhane/afrimmlu
dataset_name: null
output_type: multiple_choice
validation_split: validation
test_split: test
fewshot_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
- metric: f1
aggregation: !function utils.weighted_f1_score
# aggregation: mean
average: weighted
hf_evaluate: true
higher_is_better: True
ignore_case: true
ignore_punctuation: true
regexes_to_ignore:
- ","
- "\\$"
- metric: acc
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
regexes_to_ignore:
- ","
- "\\$"
metadata:
version: 1.0
dataset_name: amh
include: afrimmlu_common_yaml
task: afrimmlu_direct_amh
dataset_name: eng
include: afrimmlu_common_yaml
task: afrimmlu_direct_eng
dataset_name: ewe
include: afrimmlu_common_yaml
task: afrimmlu_direct_ewe
dataset_name: fra
include: afrimmlu_common_yaml
task: afrimmlu_direct_fra
\ No newline at end of file
dataset_name: hau
include: afrimmlu_common_yaml
task: afrimmlu_direct_hau
\ No newline at end of file
dataset_name: ibo
include: afrimmlu_common_yaml
task: afrimmlu_direct_ibo
\ No newline at end of file
dataset_name: kin
include: afrimmlu_common_yaml
task: afrimmlu_direct_kin
\ No newline at end of file
dataset_name: lin
include: afrimmlu_common_yaml
task: afrimmlu_direct_lin
\ No newline at end of file
dataset_name: lug
include: afrimmlu_common_yaml
task: afrimmlu_direct_lug
\ No newline at end of file
dataset_name: orm
include: afrimmlu_common_yaml
task: afrimmlu_direct_orm
\ No newline at end of file
dataset_name: sna
include: afrimmlu_common_yaml
task: afrimmlu_direct_sna
\ No newline at end of file
dataset_name: sot
include: afrimmlu_common_yaml
task: afrimmlu_direct_sot
\ No newline at end of file
dataset_name: swa
include: afrimmlu_common_yaml
task: afrimmlu_direct_swa
\ No newline at end of file
dataset_name: twi
include: afrimmlu_common_yaml
task: afrimmlu_direct_twi
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment