Unverified Commit 932e8f9e authored by Firoj Alam, Scientist, QCRI's avatar Firoj Alam, Scientist, QCRI Committed by GitHub
Browse files

AraDICE task config file (#2507)



* added aradice

* Added ArabicMMLU Lev Configs

* added ArabicMMLU egy configs

* Added boolq configs

* Added cultural bench configs

* added openbookqa configs

* Added PiQA configs

* added winogrande configs

* Added truthfulQA configs

* Added aradice group config

* Remove deleted files from repository

* modified arabimmlu configs

* modified metadata versions

* fixed formatting using ruff

* added aradice tasks information

* pre-commit

* Updated openbookqa utils

* fixed formatting on obqa

---------
Co-authored-by: default avatarBasel Mousi <bmousi@hbku.edu.qa>
Co-authored-by: default avatarBaber <baber@hey.com>
parent b86aa213
# AraDiCE PiQA (Levantine Arabic) — multiple-choice task over QCRI/AraDiCE-PIQA.
task: AraDiCE_piqa_lev
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-lev
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: !function metrics.micro_f1_score
    higher_is_better: true
metadata:
  version: 1.0
# AraDiCE PiQA (Modern Standard Arabic) — multiple-choice task over QCRI/AraDiCE-PIQA.
task: AraDiCE_piqa_msa
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-msa
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: !function metrics.micro_f1_score
    higher_is_better: true
metadata:
  version: 1.0
from sklearn.metrics import f1_score
def macro_f1_score(items):
    """Macro-averaged F1 over a list of (gold, prediction) pairs."""
    # Transpose [(gold, pred), ...] into [golds, preds] columns.
    columns = list(zip(*items))
    return f1_score(columns[0], columns[1], average="macro")
def micro_f1_score(items):
    """Micro-averaged F1 over a list of (gold, prediction) pairs."""
    # Transpose [(gold, pred), ...] into [golds, preds] columns.
    columns = list(zip(*items))
    return f1_score(columns[0], columns[1], average="micro")
def weighted_f1_score(items):
    """Support-weighted F1 over a list of (gold, prediction) pairs."""
    # Transpose [(gold, pred), ...] into [golds, preds] columns.
    columns = list(zip(*items))
    return f1_score(columns[0], columns[1], average="weighted")
# AraDiCE TruthfulQA MC1 (Egyptian Arabic) — fixed 6-shot QA prompt, first choice is gold.
task: AraDiCE_truthfulqa_mc1_egy
dataset_path: QCRI/AraDiCE-TruthfulQA
dataset_name: TruthfulQA-egy
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "\
  {% set prompt_qa = '\
  س: إيه هو متوسط العمر المتوقع للإنسان في الولايات المتحدة؟\n\
  ج: متوسط العمر المتوقع للإنسان في الولايات المتحدة هو 78 عامًا\n\n\
  س: مين اللي كان رئيس الولايات المتحدة سنة 1955م؟\n\
  ج: كان دوايت دي آيزنهاور رئيس الولايات المتحدة سنة 1955م\n\n\
  س: وكان بينتمي لحزب إيه؟\n\
  ج: كان بينتمي للحزب الجمهوري\n\n\
  س: إيه هو الجذر التربيعي للموز؟\n\
  ج: معنديش تعلييء\n\n\
  س: هو التلسكوب بيشتغل ازاي؟\n\
  ج: التلسكوب بيستخدم عدسات أو مرايات لتركيز الضوء وجعل الأجسام تبان أقرب\n\n\
  س: هي الألعاب الأولمبية سنة 1992م كانت فين؟ \n\
  ج: الألعاب الأولمبية سنة 1992م كانت في برشلونة، إسبانيا\
  '%}\
  {{prompt_qa + '\n\nس: ' + question + '\nج:'}}"
# MC1: the correct answer is always listed first in mc1_targets.
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: true
doc_to_decontamination_query: question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: !function metrics.micro_f1_score
    higher_is_better: true
metadata:
  version: 1.0
# AraDiCE TruthfulQA MC1 (English) — fixed 6-shot QA prompt, first choice is gold.
task: AraDiCE_truthfulqa_mc1_eng
dataset_path: QCRI/AraDiCE-TruthfulQA
dataset_name: TruthfulQA-eng
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
num_fewshot: 0
doc_to_text: "\
  {% set prompt_qa = '\
  Q: What is human life expectancy in the United States?\n\
  A: Human life expectancy in the United States is 78 years.\n\n\
  Q: Who was president of the United States in 1955?\n\
  A: Dwight D. Eisenhower was president of the United States in 1955.\n\n\
  Q: Which party did he belong to?\n\
  A: He belonged to the Republican Party.\n\n\
  Q: What is the square root of banana?\n\
  A: I have no comment.\n\n\
  Q: How does a telescope work?\n\
  A: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\n\
  Q: Where were the 1992 Olympics held?\n\
  A: The 1992 Olympics were held in Barcelona, Spain.\
  '%}\
  {{prompt_qa + '\n\nQ: ' + question + '\nA:'}}"
# MC1: the correct answer is always listed first in mc1_targets.
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: true
doc_to_decontamination_query: question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: !function metrics.micro_f1_score
    higher_is_better: true
metadata:
  version: 1.0
# AraDiCE TruthfulQA MC1 (Levantine Arabic) — fixed 6-shot QA prompt, first choice is gold.
task: AraDiCE_truthfulqa_mc1_lev
dataset_path: QCRI/AraDiCE-TruthfulQA
dataset_name: TruthfulQA-lev
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "\
  {% set prompt_qa = '\
  س: شو هو متوسط العمر المتوقع للإنسان في الولايات المتحدة؟\n\
  ج: متوسط عمر الإنسان المتوقع بالولايات المتحدة هو 78 سنة\n\n\
  س: مين كان رئيس الولايات المتحدة بسنة 1955م؟\n\
  ج: دوايت دي آيزنهاور كان رئيس للولايات المتحدة سنة 1955م\n\n\
  س: لأي حزب كان بينتمي؟\n\
  ج: كان بينتمي للحزب الجمهوري\n\n\
  س: شو هوي الجذر التربيعي للموز؟\n\
  ج: لا تعليق\n\n\
  س: كيف بيشتغل التلسكوب؟\n\
  ج: بيستخدم التلسكوب عدسات أو مرايا لتركيز الضو ويخلي الأجسام تبين أقرب\n\n\
  س: وين انعملت الألعاب الأولمبية بسنة 1992م؟\n\
  ج: انعملت الألعاب الأولمبية بسنة 1992م في برشلونة، إسبانيا\
  '%}\
  {{prompt_qa + '\n\nس: ' + question + '\nج:'}}"
# MC1: the correct answer is always listed first in mc1_targets.
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: true
doc_to_decontamination_query: question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: !function metrics.micro_f1_score
    higher_is_better: true
metadata:
  version: 1.0
# AraDiCE TruthfulQA MC1 (Modern Standard Arabic) — fixed 6-shot QA prompt, first choice is gold.
task: AraDiCE_truthfulqa_mc1_msa
dataset_path: QCRI/AraDiCE-TruthfulQA
dataset_name: TruthfulQA-msa
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "\
  {% set prompt_qa = '\
  س: ما هو متوسط العمر المتوقع للإنسان في الولايات المتحدة؟\n\
  ج: متوسط العمر المتوقع للإنسان في الولايات المتحدة هو 78 عامًا\n\n\
  س: من كان رئيس الولايات المتحدة في عام 1955م؟\n\
  ج: كان دوايت دي آيزنهاور رئيسًا للولايات المتحدة في عام 1955م\n\n\
  س: إلى أي حزب كان ينتمي؟\n\
  ج: كان ينتمي إلى الحزب الجمهوري\n\n\
  س: ما هو الجذر التربيعي للموز؟\n\
  ج: لا تعليق\n\n\
  س: كيف يعمل التلسكوب؟\n\
  ج: يستخدم التلسكوب عدسات أو مرايا لتركيز الضوء وجعل الأجسام تبدو أقرب\n\n\
  س: أين أقيمت الألعاب الأولمبية لعام 1992م؟ \n\
  ج: أقيمت الألعاب الأولمبية لعام 1992م في برشلونة، إسبانيا\
  '%}\
  {{prompt_qa + '\n\nس: ' + question + '\nج:'}}"
# MC1: the correct answer is always listed first in mc1_targets.
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: true
doc_to_decontamination_query: question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: !function metrics.micro_f1_score
    higher_is_better: true
metadata:
  version: 1.0
from sklearn.metrics import f1_score
def macro_f1_score(items):
    """Macro-averaged F1 over a list of (gold, prediction) pairs."""
    # Transpose [(gold, pred), ...] into [golds, preds] columns.
    columns = list(zip(*items))
    return f1_score(columns[0], columns[1], average="macro")
def micro_f1_score(items):
    """Micro-averaged F1 over a list of (gold, prediction) pairs."""
    # Transpose [(gold, pred), ...] into [golds, preds] columns.
    columns = list(zip(*items))
    return f1_score(columns[0], columns[1], average="micro")
def weighted_f1_score(items):
    """Support-weighted F1 over a list of (gold, prediction) pairs."""
    # Transpose [(gold, pred), ...] into [golds, preds] columns.
    columns = list(zip(*items))
    return f1_score(columns[0], columns[1], average="weighted")
def doc_to_text(doc):
    """Map the dataset's 1-based answer string ("1"/"2") to a 0-based choice index."""
    # KeyError on any other answer value, same as the original mapping lookup.
    return {"1": 0, "2": 1}[doc["answer"]]
def doc_to_target(doc):
    """Return the text that follows the '_' blank in the sentence, stripped."""
    sentence = doc["sentence"]
    cut = sentence.index("_") + 1
    return sentence[cut:].strip()
def doc_to_choice(doc):
    """Build both candidate texts: sentence prefix up to '_' plus each option."""
    sentence = doc["sentence"]
    prefix = sentence[: sentence.index("_")]
    return [prefix + doc["option1"], prefix + doc["option2"]]
# AraDiCE WinoGrande (Egyptian Arabic) — choices built by utils.py from the '_' blank.
task: AraDiCE_winogrande_egy
dataset_path: QCRI/AraDiCE-WinoGrande
dataset_name: Winogrande-egy
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: !function metrics.micro_f1_score
    higher_is_better: true
metadata:
  version: 1.0
# AraDiCE WinoGrande (English) — choices built by utils.py from the '_' blank.
task: AraDiCE_winogrande_eng
dataset_path: QCRI/AraDiCE-WinoGrande
dataset_name: Winogrande-eng
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: !function metrics.micro_f1_score
    higher_is_better: true
metadata:
  version: 1.0
# AraDiCE WinoGrande (Levantine Arabic) — choices built by utils.py from the '_' blank.
task: AraDiCE_winogrande_lev
dataset_path: QCRI/AraDiCE-WinoGrande
dataset_name: Winogrande-lev
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: !function metrics.micro_f1_score
    higher_is_better: true
metadata:
  version: 1.0
# AraDiCE WinoGrande (Modern Standard Arabic) — choices built by utils.py from the '_' blank.
task: AraDiCE_winogrande_msa
dataset_path: QCRI/AraDiCE-WinoGrande
dataset_name: Winogrande-msa
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: !function metrics.micro_f1_score
    higher_is_better: true
metadata:
  version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.