Commit 89b6bdb3 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into ai2d

parents 59053d58 144a1e58
......@@ -258,7 +258,7 @@ def doc_to_text(src: str, tgt: str) -> str:
src_name, tgt_name = map(code_to_language_name, [src, tgt])
return f"""\
{src_name} sentence: {jinja_var('sentence_' + src)}
{src_name} sentence: {jinja_var("sentence_" + src)}
{tgt_name} sentence:"""
......
task: piqa_eu
dataset_path: HiTZ/PIQA-eu
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: null
doc_to_text: "Galdera: {{goal}}\nErantzuna:"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
......@@ -30,6 +30,12 @@ Homepage: https://github.com/google/BIG-bench
* `group_name`: `Short description`
#### Tags
* `bigbench_generate_until`
* `bigbench_multiple_choice_a`
* `bigbench_multiple_choice_b`
#### Tasks
* `task_name`: `1-sentence description of what this particular task does`
......
group: bigbench_generate_until
tag: bigbench_generate_until
dataset_path: hails/bigbench
output_type: generate_until
dataset_kwargs:
......
group: bigbench_multiple_choice
tag: bigbench_multiple_choice_a
dataset_path: hails/bigbench
dataset_kwargs:
# num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
......
group: bigbench_multiple_choice
tag: bigbench_multiple_choice_b
dataset_path: hails/bigbench
dataset_kwargs:
# num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
......
......@@ -259,7 +259,7 @@ def doc_to_text(src: str, tgt: str) -> str:
src_name, tgt_name = map(code_to_language_name, [src, tgt])
return f"""\
{src_name} sentence: {jinja_var('sentence_' + src)}
{src_name} sentence: {jinja_var("sentence_" + src)}
{tgt_name} sentence:"""
......
# File generated by `create-yamls.py`
include: _phrases_va_common.yaml
include: _phrases_va_common
task: phrases_ca-va
doc_to_text: 'Oració en català: {{ca}}
......
# File generated by `create-yamls.py`
include: _phrases_va_common.yaml
include: _phrases_va_common
task: phrases_va-ca
doc_to_text: 'Oració en valencià: {{va}}
......
......@@ -7,7 +7,7 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
### Context: {doc["context"]}
### Question: {doc["question"]}
### Options:
(1) {doc['option#1']}\n(2) {doc["option#2"]}\n(3) {doc["option#3"]}\n(4) {doc['option#4']}\n(5) {doc['option#5']}
(1) {doc["option#1"]}\n(2) {doc["option#2"]}\n(3) {doc["option#3"]}\n(4) {doc["option#4"]}\n(5) {doc["option#5"]}
### Answer: 주어진 문제의 정답은"""
out_doc = {
......
......@@ -258,7 +258,7 @@ def doc_to_text(src: str, tgt: str) -> str:
src_name, tgt_name = map(code_to_language_name, [src, tgt])
return f"""\
{src_name} sentence: {jinja_var('sentence_' + src)}
{src_name} sentence: {jinja_var("sentence_" + src)}
{tgt_name} sentence:"""
......
# Global-MMLU
### Paper
Title: `Global MMLU: Understanding and Addressing Cultural and Linguistic Biases in Multilingual Evaluation`
Abstract: [https://arxiv.org/abs/2412.03304](https://arxiv.org/abs/2412.03304)
Global-MMLU 🌍 is a multilingual evaluation set spanning 42 languages, including English. This dataset combines machine translations for MMLU questions along with professional translations and crowd-sourced post-edits. It also includes cultural sensitivity annotations for a subset of the questions (2850 questions per language) and classifies them as Culturally Sensitive (CS) 🗽 or Culturally Agnostic (CA) ⚖️. These annotations were collected as part of an open science initiative led by Cohere For AI in collaboration with many external collaborators from both industry and academia.
Global-MMLU-Lite is a balanced collection of culturally sensitive and culturally agnostic MMLU tasks. It is designed for efficient evaluation of multilingual models in 15 languages (including English). Only languages with human translations and post-edits in the original [Global-MMLU](https://huggingface.co/datasets/CohereForAI/Global-MMLU) 🌍 dataset have been included in the lite version.
Homepage: \
[https://huggingface.co/datasets/CohereForAI/Global-MMLU](https://huggingface.co/datasets/CohereForAI/Global-MMLU) \
[https://huggingface.co/datasets/CohereForAI/Global-MMLU-Lite](https://huggingface.co/datasets/CohereForAI/Global-MMLU-Lite)
#### Groups
* `global_mmlu_{lang}`: This group uses the `Global-MMLU-Lite` benchmark, which supports 15 languages (including English).
* `global_mmlu_full_{lang}`: This group uses `Global-MMLU` benchmark which supports 42 languages.
#### Subgroups (supported only for the `full` version)
* `global_mmlu_full_stem`
* `global_mmlu_full_humanities`
* `global_mmlu_full_social_sciences`
* `global_mmlu_full_other`
### Citation
```bibtex
@misc{singh2024globalmmluunderstandingaddressing,
title={Global MMLU: Understanding and Addressing Cultural and Linguistic Biases in Multilingual Evaluation},
author={Shivalika Singh and Angelika Romanou and Clémentine Fourrier and David I. Adelani and Jian Gang Ngui and Daniel Vila-Suero and Peerat Limkonchotiwat and Kelly Marchisio and Wei Qi Leong and Yosephine Susanto and Raymond Ng and Shayne Longpre and Wei-Yin Ko and Madeline Smith and Antoine Bosselut and Alice Oh and Andre F. T. Martins and Leshem Choshen and Daphne Ippolito and Enzo Ferrante and Marzieh Fadaee and Beyza Ermis and Sara Hooker},
year={2024},
eprint={2412.03304},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2412.03304},
}
```
dataset_path: CohereForAI/Global-MMLU-Lite
dataset_name: ar
test_split: test
fewshot_split: dev
fewshot_config:
sampler: default
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
group: global_mmlu_ar
task:
- global_mmlu_ar_business
- global_mmlu_ar_humanities
- global_mmlu_ar_medical
- global_mmlu_ar_other
- global_mmlu_ar_stem
- global_mmlu_ar_social_sciences
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0.0
# Generated by _generate_configs.py
include: _ar_template_yaml
process_docs: !function utils.process_business
task: global_mmlu_ar_business
# Generated by _generate_configs.py
include: _ar_template_yaml
process_docs: !function utils.process_humanities
task: global_mmlu_ar_humanities
# Generated by _generate_configs.py
include: _ar_template_yaml
process_docs: !function utils.process_medical
task: global_mmlu_ar_medical
# Generated by _generate_configs.py
include: _ar_template_yaml
process_docs: !function utils.process_other
task: global_mmlu_ar_other
# Generated by _generate_configs.py
include: _ar_template_yaml
process_docs: !function utils.process_social_sciences
task: global_mmlu_ar_social_sciences
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment