Commit 60c9c170 authored by haileyschoelkopf's avatar haileyschoelkopf
Browse files

Merge branch 'main' into inverse-scaling-tasks

parents 4b2d565b b4cd85d4
# Generated by utils.py
dataset_name: undo_permutation_zero_shot
include: ../multiple_choice_template_yaml
include: ../multiple_choice_template_a_yaml
task: bigbench_undo_permutation_multiple_choice
# Generated by utils.py
dataset_name: unit_conversion_zero_shot
include: ../multiple_choice_template_yaml
include: ../multiple_choice_template_a_yaml
task: bigbench_unit_conversion_multiple_choice
# Generated by utils.py
dataset_name: unit_interpretation_zero_shot
include: ../multiple_choice_template_yaml
include: ../multiple_choice_template_a_yaml
task: bigbench_unit_interpretation_multiple_choice
# Generated by utils.py
dataset_name: unnatural_in_context_learning_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_unnatural_in_context_learning_multiple_choice
# Generated by utils.py
dataset_name: vitaminc_fact_verification_zero_shot
include: ../multiple_choice_template_yaml
include: ../multiple_choice_template_a_yaml
task: bigbench_vitaminc_fact_verification_multiple_choice
# Generated by utils.py
dataset_name: what_is_the_tao_zero_shot
include: ../multiple_choice_template_yaml
include: ../multiple_choice_template_a_yaml
task: bigbench_what_is_the_tao_multiple_choice
# Generated by utils.py
dataset_name: which_wiki_edit_zero_shot
include: ../multiple_choice_template_yaml
include: ../multiple_choice_template_a_yaml
task: bigbench_which_wiki_edit_multiple_choice
# Generated by utils.py
dataset_name: winowhy_zero_shot
include: ../multiple_choice_template_yaml
include: ../multiple_choice_template_a_yaml
task: bigbench_winowhy_multiple_choice
# Generated by utils.py
dataset_name: word_sorting_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_word_sorting_multiple_choice
# Generated by utils.py
dataset_name: word_unscrambling_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_word_unscrambling_multiple_choice
......@@ -12,4 +12,4 @@ metric_list:
- metric: acc
# TODO: brier score and other metrics
metadata:
version: 0.0
version: 1.0
group: bigbench_multiple_choice
dataset_path: hails/bigbench
dataset_kwargs:
# num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
# subtask_name: null
output_type: multiple_choice
test_split: default
doc_to_text: inputs
doc_to_target: "{{multiple_choice_scores.index(1)}}"
doc_to_choice: "{{multiple_choice_targets}}"
metric_list:
- metric: acc
# TODO: brier score and other metrics
metadata:
version: 1.0
......@@ -7,7 +7,7 @@ import os
import yaml
from tqdm import tqdm
from lm_eval.logger import eval_logger
from lm_eval.utils import eval_logger
SUBJECTS = {
......
# COPAL
### Paper
Title: `COPAL-ID: Indonesian Language Reasoning with Local Culture and Nuances`
Abstract: `https://arxiv.org/abs/2311.01012`
`COPAL-ID is an Indonesian causal commonsense reasoning dataset that captures local nuances. It provides a more natural portrayal of day-to-day causal reasoning within the Indonesian (especially Jakartan) cultural sphere. Professionally written and validated from scratch by natives, COPAL-ID is more fluent and free from awkward phrases, unlike the translated XCOPA-ID.`
Homepage: `https://github.com/haryoa/copal-id`
### Citation
```
@article{wibowo2023copal,
title={COPAL-ID: Indonesian Language Reasoning with Local Culture and Nuances},
author={Wibowo, Haryo Akbarianto and Fuadi, Erland Hilman and Nityasya, Made Nindyatama and Prasojo, Radityo Eko and Aji, Alham Fikri},
journal={arXiv preprint arXiv:2311.01012},
year={2023}
}
```
### Groups and Tasks
#### Groups
* `copal_id`
#### Tasks
* `copal_id_standard`: `Standard version of COPAL dataset, use formal language and less local nuances`
* `copal_id_colloquial`: `Colloquial version of COPAL dataset, use informal language and more local nuances`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
include: standard.yaml
task: copal_id_colloquial
task_alias: colloquial
test_split: test_colloquial
group: copal_id
task: copal_id_standard
task_alias: standard
dataset_path: haryoaw/COPAL
dataset_name: id
output_type: multiple_choice
test_split: test
doc_to_text: !function utils.doc_to_text_id
doc_to_target: label
doc_to_choice: !function utils.doc_to_choice
metric_list:
- metric: acc
metadata:
version: 1.0
from functools import partial
def convert_choice(choice):
    """Lower-case the first character of *choice* so it reads naturally mid-sentence.

    Returns the string unchanged if it is empty (the original raised
    IndexError on ``""``).
    """
    if not choice:
        return choice
    return choice[0].lower() + choice[1:]
def doc_to_text(doc, connector):
    """Build the COPA-style prompt for *doc*.

    The premise's final character (its trailing punctuation) is dropped and
    the connector word matching ``doc["question"]`` ("cause"/"effect") is
    appended after a space.
    """
    connector_word = connector[doc["question"]]
    premise = doc["premise"].strip()
    return f"{premise[:-1]} {connector_word}"
def doc_to_choice(doc):
    """Return both answer options, each with its first letter lower-cased."""
    return [convert_choice(doc[key]) for key in ("choice1", "choice2")]
# Indonesian prompt builder: "karena" (= "because") connects a cause,
# "maka" (= "so/therefore") connects an effect.
doc_to_text_id = partial(
    doc_to_text,
    connector={
        "cause": "karena",
        "effect": "maka",
    },
)
# EusExams
### Paper
Title: Latxa: An Open Language Model and Evaluation Suite for Basque
Abstract: https://arxiv.org/abs/2403.20266
EusExams is a collection of tests designed to prepare individuals for Public Service examinations conducted by several Basque institutions, including the public health system Osakidetza, the Basque Government, the City Councils of Bilbao and Gasteiz, and the University of the Basque Country (UPV/EHU). Within each of these groups, there are different exams for public positions, such as administrative and assistant roles. Each multiple-choice question contains 2 to 4 choices (3.90 on average) and one correct answer. The dataset is mostly parallel with 16k questions in Basque and 18k in Spanish.
Homepage: https://github.com/hitz-zentroa/latxa
### Citation
```
@misc{etxaniz2024latxa,
title={Latxa: An Open Language Model and Evaluation Suite for Basque},
author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa},
year={2024},
eprint={2403.20266},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
* `eus_exams_eu`: The Basque version of the exams.
* `eus_exams_es`: The Spanish version of the exams.
#### Tasks
Basque and Spanish versions of the exams are available as separate tasks starting with `eus_exams_eu` and `eus_exams_es` respectively.
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import argparse
import json
import requests
import yaml
# Fetch the list of EusExams dataset configs from the Hugging Face
# datasets server.
# NOTE(review): this runs at module import time, so importing this file
# requires network access and may raise on timeout (5 s) or if the
# response schema changes — consider moving it into a function.
response = requests.get(
    "https://datasets-server.huggingface.co/splits?dataset=HiTZ%2FEusExams", timeout=5
)
response_json = json.loads(response.text)
# One entry per dataset config (Basque and Spanish exam subsets).
CONFIGS = [split["config"] for split in response_json["splits"]]
def gen_config_yamls(output_dir: str, overwrite: bool) -> None:
    """
    Generate a yaml task file for each dataset config in ``CONFIGS``.

    :param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist.
    :raises FileExistsError: if ``overwrite`` is False and any target file
        already exists; raised once after all configs have been attempted,
        listing every file that was skipped.
    """
    err = []
    for config in CONFIGS:
        file_name = f"eus_exams_{config}.yaml"
        try:
            # "x" mode fails with FileExistsError if the file already exists,
            # so existing files are never clobbered unless --overwrite is set.
            with open(f"{output_dir}/{file_name}", "w" if overwrite else "x") as f:
                f.write("# Generated by utils.py\n")
                yaml.dump(
                    {
                        # Spanish configs include the Spanish base template,
                        # everything else the Basque one.
                        "include": "eus_exams_es"
                        if "eus_exams_es" in config
                        else "eus_exams_eu",
                        "dataset_name": config,
                        "task": f"eus_exams_{config}",
                    },
                    f,
                )
        except FileExistsError:
            err.append(file_name)
    if err:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(err)}"
        )
def main() -> None:
    """Parse command-line arguments and generate one yaml file per config."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Overwrite files if they already exist",
    )
    arg_parser.add_argument(
        "--output-dir", default=".", help="Directory to write yaml files to"
    )
    parsed = arg_parser.parse_args()
    gen_config_yamls(output_dir=parsed.output_dir, overwrite=parsed.overwrite)
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
dataset_path: HiTZ/EusExams
dataset_name: null
validation_split: null
test_split: test
fewshot_split: test
process_docs: !function utils.process_docs
output_type: multiple_choice
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment