Commit 2184b8de authored by lintangsutawika

Merge branch 'cont-metrics' of https://github.com/EleutherAI/lm-evaluation-harness into alt_worlds

parents b1ba4e71 1522009c
```diff
@@ -58,7 +58,7 @@ class TextSynthLM(LM):
 
     @property
     def eot_token_id(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()
 
     @property
@@ -72,20 +72,20 @@ class TextSynthLM(LM):
     @property
     def batch_size(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()
 
     @property
     def device(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()
 
     def tok_encode(self, string: str):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
        raise NotImplementedError()
 
     def tok_decode(self, tokens):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()
 
     def loglikelihood(self, requests):
@@ -122,7 +122,7 @@ class TextSynthLM(LM):
             "input tokenization support from TextSynth."
         )
 
-    def greedy_until(self, requests):
+    def generate_until(self, requests):
         if not requests:
             return []
@@ -146,7 +146,7 @@ class TextSynthLM(LM):
                 s = resp["text"]
                 res.append(s)
-                self.cache_hook.add_partial("greedy_until", (inp, request_args), s)
+                self.cache_hook.add_partial("generate_until", (inp, request_args), s)
             else:
                 logger.error(
                     f"The following response does not contain generated `text`. "
@@ -160,5 +160,5 @@ class TextSynthLM(LM):
         raise NotImplementedError()
 
     def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
+        # Isn't used because we override generate_until
         raise NotImplementedError()
```
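The rename is mechanical, but it defines the contract every backend now has to satisfy. Below is a minimal sketch, not part of this commit, of an `LM` subclass implementing the new hook. It assumes requests arrive as `(context, gen_kwargs)` pairs, mirroring the TextSynth code above, and `call_backend` is a hypothetical stand-in for a real API client:

```python
from lm_eval.api.model import LM  # import path as used by the refactored harness


def call_backend(prompt, stop=None):
    # Hypothetical API client; a real backend would issue a completion request here.
    return ""


class StubAPILM(LM):
    def generate_until(self, requests):  # formerly greedy_until
        results = []
        for context, gen_kwargs in requests:
            text = call_backend(context, stop=gen_kwargs.get("until"))
            results.append(text)
            # Cache under the new name so resumed runs look up the right key.
            self.cache_hook.add_partial("generate_until", (context, gen_kwargs), text)
        return results

    def loglikelihood(self, requests):
        raise NotImplementedError()

    def loglikelihood_rolling(self, requests):
        raise NotImplementedError()
```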
```diff
@@ -59,6 +59,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] MGSM
 - [ ] SCROLLS
 - [x] Babi
+- [x] Belebele
 
 # Novel Tasks
 Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
```
```diff
@@ -4,7 +4,6 @@ from typing import List, Union, Dict
 
 from lm_eval import utils
 from lm_eval import prompts
-from lm_eval.logger import eval_logger
 from lm_eval.api.task import TaskConfig, Task, ConfigurableTask
 from lm_eval.api.registry import (
     register_task,
@@ -14,6 +13,10 @@ from lm_eval.api.registry import (
     ALL_TASKS,
 )
 
+import logging
+
+eval_logger = logging.getLogger("lm-eval")
+
 
 def register_configurable_task(config: Dict[str, str]) -> int:
     SubClass = type(
@@ -98,7 +101,7 @@ def check_prompt_config(
                         ]
                     )
                 },
-                **{"output_type": "greedy_until"},
+                **{"output_type": "generate_until"},
             }
         )
     else:
@@ -145,7 +148,7 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
         except Exception as error:
             import traceback
 
-            eval_logger.warning(
+            eval_logger.debug(
                 "Failed to load config in\n"
                 f" {yaml_path}\n"
                 " Config will not be added to registry\n"
```
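One behavioral consequence of the hunk above: `eval_logger` is now a plain `logging.getLogger("lm-eval")`, and failures to load a task config are logged at DEBUG rather than WARNING, so they are invisible under default logging. A minimal sketch, assuming no other handler configuration, of how a caller could surface them while debugging:

```python
import logging

# Route the shared "lm-eval" logger to stderr at DEBUG so that the
# "Failed to load config" messages emitted via eval_logger.debug() appear.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger("lm-eval").setLevel(logging.DEBUG)
```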
```diff
 task: babi
 dataset_path: Muennighoff/babi
 dataset_name: null
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: valid
 test_split: test
```
```diff
 group: bbh_flan_cot_fewshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
```
```diff
 group: bbh_flan_cot_zeroshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
```
```diff
 group: bbh_flan_fewshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
```
```diff
 group: bbh_flan_zeroshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
```
# Belebele
### Paper
The Belebele Benchmark for Massively Multilingual NLU Evaluation
https://arxiv.org/abs/2308.16884
Belebele is a multiple-choice machine reading comprehension (MRC) dataset spanning 122 language variants. This dataset enables the evaluation of mono- and multi-lingual models in high-, medium-, and low-resource languages. Each question has four multiple-choice answers and is linked to a short passage from the FLORES-200 dataset. The human annotation procedure was carefully curated to create questions that discriminate between different levels of generalizable language comprehension and is reinforced by extensive quality checks. While all questions directly relate to the passage, the English dataset on its own proves difficult enough to challenge state-of-the-art language models. Being fully parallel, this dataset enables direct comparison of model performance across all languages. Belebele opens up new avenues for evaluating and analyzing the multilingual abilities of language models and NLP systems.
Homepage: https://github.com/facebookresearch/belebele
### Citation
```bibtex
@misc{bandarkar2023belebele,
title={The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants},
author={Lucas Bandarkar and Davis Liang and Benjamin Muller and Mikel Artetxe and Satya Narayan Shukla and Donald Husa and Naman Goyal and Abhinandan Krishnan and Luke Zettlemoyer and Madian Khabsa},
year={2023},
eprint={2308.16884},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
- `belebele`: All 122 languages of the Belebele dataset, evaluated following the methodology in MMLU's original implementation.
#### Tasks
The following tasks evaluate languages in the Belebele dataset using loglikelihood-based multiple-choice scoring:
- `belebele_{language}`
The variant evaluated here is the zero-shot or few-shot setup with English instructions.
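For concreteness, a hedged sketch of running one of these tasks through the harness's Python entry point; the `simple_evaluate` signature and the `hf` backend name follow the refactor branch at roughly this time, and the checkpoint is purely illustrative:

```python
from lm_eval import evaluator

# Zero-shot evaluation of a single Belebele language (English) with a small
# HuggingFace model; swap tasks=["belebele"] to run the full 122-language group.
results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["belebele_eng_Latn"],
    num_fewshot=0,
)
print(results["results"])
```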
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [ ] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
```yaml
group: belebele
dataset_path: facebook/belebele
test_split: test
fewshot_split: test
fewshot_config:
  sampler: first_n
output_type: multiple_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question}}"
doc_to_text: "P: {{flores_passage}}\nQ: {{question.strip()}}\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
```
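The one non-obvious line above is `doc_to_target`: Belebele stores the correct answer as a 1-indexed string (`correct_answer_num`), and the template maps it to the 0-indexed position of the matching letter in `doc_to_choice`. A minimal sketch of how these Jinja2 templates render a record; the field values are invented, while the field names follow the facebook/belebele schema assumed by the YAML:

```python
from jinja2 import Template

# Invented record using the field names referenced by the templates above.
doc = {
    "flores_passage": "The quick brown fox jumps over the lazy dog.",
    "question": "What does the fox jump over? ",
    "mc_answer1": "A fence",
    "mc_answer2": "The lazy dog",
    "mc_answer3": "A river",
    "mc_answer4": "Nothing",
    "correct_answer_num": "2",  # 1-indexed, stored as a string
}

doc_to_text = Template(
    "P: {{flores_passage}}\nQ: {{question.strip()}}\n"
    "A: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nAnswer:"
)
doc_to_target = Template("{{['1', '2', '3', '4'].index(correct_answer_num)}}")

print(doc_to_text.render(**doc))    # the prompt shown to the model
print(doc_to_target.render(**doc))  # "1" -> choice "B", i.e. "The lazy dog"
```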
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
import requests
from tqdm import tqdm
from lm_eval.logger import eval_logger
API_URL = "https://datasets-server.huggingface.co/splits?dataset=facebook/belebele"
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="belebele")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path) as f:
cot_file = json.load(f)
def query():
response = requests.get(API_URL)
return response.json()["splits"]
languages = [split["config"] for split in query()]
for lang in tqdm(languages):
yaml_dict = {
"include": base_yaml_name,
"task": f"belebele_{args.task_prefix}_{lang}"
if args.task_prefix != ""
else f"belebele_{lang}",
"dataset_name": lang,
}
file_save_path = args.save_prefix_path + f"_{lang}.yaml"
eval_logger.info(f"Saving yaml for subset {lang} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
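Invoked as, e.g., `python _generate_configs.py --base_yaml_path _default_template_yaml` (the script's filename is assumed here), it writes one small YAML per language config reported by the datasets server. The fully quoted style of the generated files below comes from `default_style='"'` in the `yaml.dump` call.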
"dataset_name": "acm_Arab"
"include": "_default_template_yaml"
"task": "belebele_acm_Arab"
"dataset_name": "afr_Latn"
"include": "_default_template_yaml"
"task": "belebele_afr_Latn"
"dataset_name": "als_Latn"
"include": "_default_template_yaml"
"task": "belebele_als_Latn"
"dataset_name": "amh_Ethi"
"include": "_default_template_yaml"
"task": "belebele_amh_Ethi"
"dataset_name": "apc_Arab"
"include": "_default_template_yaml"
"task": "belebele_apc_Arab"
"dataset_name": "arb_Arab"
"include": "_default_template_yaml"
"task": "belebele_arb_Arab"
"dataset_name": "arb_Latn"
"include": "_default_template_yaml"
"task": "belebele_arb_Latn"
"dataset_name": "ars_Arab"
"include": "_default_template_yaml"
"task": "belebele_ars_Arab"
"dataset_name": "ary_Arab"
"include": "_default_template_yaml"
"task": "belebele_ary_Arab"