Commit 173b2bc3 authored by Baber

Merge branch 'main' into humaneval

# Conflicts:
#	lm_eval/api/task.py
parents 74344829 bb098f13
task: AraDiCE_jordan_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Jordan
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
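For context, configs like the one above are addressed by their task name. A minimal sketch of invoking this task through the harness's Python API might look like the following; the simple_evaluate entry point is assumed from lm_eval, and the model checkpoint is a placeholder, not part of this commit.

# Hypothetical usage sketch: run the Jordan cultural task via lm_eval's Python API.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=Qwen/Qwen2-0.5B",  # placeholder checkpoint
    tasks=["AraDiCE_jordan_cultural"],
)
print(results["results"]["AraDiCE_jordan_cultural"])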
task: AraDiCE_lebanon_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Lebanon
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
from sklearn.metrics import f1_score


def macro_f1_score(items):
    # items is a list of (gold, pred) pairs accumulated per document;
    # unzip into parallel lists before scoring the whole set at once.
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
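The f1 aggregation above consumes one (gold, pred) pair per document. A toy illustration of how micro_f1_score behaves on made-up data (not from the dataset):

# Hypothetical toy data: each item is a (gold_index, predicted_index) pair.
# Micro-averaged F1 over single-label predictions equals plain accuracy.
items = [(0, 0), (1, 2), (2, 2), (1, 1)]
print(micro_f1_score(items))  # 0.75 for this toy set (3 of 4 correct)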
task: AraDiCE_palestine_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Palestine
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_qatar_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Qatar
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_syria_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Syria
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
def process_docs(dataset):
    def _helper(doc):
        # Collect the three answer options into a single "choices" list,
        # which the cultural task configs reference via doc_to_choice.
        doc["choices"] = [doc["Option A"], doc["Option B"], doc["Option C"]]
        return doc

    return dataset.map(_helper)
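A rough sketch of what process_docs produces for a single row of the AraDiCE-Culture schema; field names follow the code above, and the option values are made up for illustration.

# Hypothetical example row run through process_docs.
from datasets import Dataset

toy = Dataset.from_list([{
    "Question": "...",
    "Option A": "answer 1",
    "Option B": "answer 2",
    "Option C": "answer 3",
}])
processed = process_docs(toy)
print(processed[0]["choices"])  # ['answer 1', 'answer 2', 'answer 3']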
from sklearn.metrics import f1_score


def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
task: AraDiCE_openbookqa_egy
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-egy
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_openbookqa_eng
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-eng
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_openbookqa_lev
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-lev
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_openbookqa_msa
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-msa
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
def doc_to_target(doc):
    # Map the gold answer key (e.g. "A") to its index among the choice labels.
    labels = [c["label"] for c in doc["question"]["choices"]]
    try:
        i = labels.index(doc["answerKey"].lstrip())
    except Exception as e:
        print("Failed", e)
        return
    return i


def doc_to_choice(doc):
    texts = [c["text"] for c in doc["question"]["choices"]]
    return texts


def doc_to_text(doc):
    return doc["question"]["stem"].strip()
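A hypothetical OpenBookQA-style document illustrating what the helpers above expect; the field names follow the code, but the content is made up.

# Hypothetical example document for the doc_to_* helpers.
doc = {
    "question": {
        "stem": "Which of these conducts electricity?",
        "choices": [
            {"label": "A", "text": "rubber"},
            {"label": "B", "text": "copper"},
        ],
    },
    "answerKey": "B",
}
print(doc_to_text(doc))    # "Which of these conducts electricity?"
print(doc_to_choice(doc))  # ["rubber", "copper"]
print(doc_to_target(doc))  # 1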
from sklearn.metrics import f1_score


def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
task: AraDiCE_piqa_egy
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-egy
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
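Unlike the cultural tasks, the PIQA configs express doc_to_choice as a Jinja template rather than a Python helper. A rough sketch of how such a template renders against a document, using plain jinja2 and made-up field values (not harness code):

# Rough sketch: rendering the doc_to_choice template "{{[sol1, sol2]}}" with jinja2.
from jinja2 import Template

doc = {"goal": "...", "sol1": "first solution", "sol2": "second solution", "label": 0}
rendered = Template("{{[sol1, sol2]}}").render(**doc)
print(rendered)  # "['first solution', 'second solution']" -- a string the harness parses into the choice list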
task: AraDiCE_piqa_eng
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-eng
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_piqa_lev
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-lev
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_piqa_msa
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-msa
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
from sklearn.metrics import f1_score


def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
task: AraDiCE_truthfulqa_mc1_egy
dataset_path: QCRI/AraDiCE-TruthfulQA
dataset_name: TruthfulQA-egy
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "\
{% set prompt_qa = '\
س: إيه هو متوسط العمر المتوقع للإنسان في الولايات المتحدة؟\n\
ج: متوسط العمر المتوقع للإنسان في الولايات المتحدة هو 78 عامًا\n\n\
س: مين اللي كان رئيس الولايات المتحدة سنة 1955م؟\n\
ج: كان دوايت دي آيزنهاور رئيس الولايات المتحدة سنة 1955م\n\n\
س: وكان بينتمي لحزب إيه؟\n\
ج: كان بينتمي للحزب الجمهوري\n\n\
س: إيه هو الجذر التربيعي للموز؟\n\
ج: معنديش تعلييء\n\n\
س: هو التلسكوب بيشتغل ازاي؟\n\
ج: التلسكوب بيستخدم عدسات أو مرايات لتركيز الضوء وجعل الأجسام تبان أقرب\n\n\
س: هي الألعاب الأولمبية سنة 1992م كانت فين؟ \n\
ج: الألعاب الأولمبية سنة 1992م كانت في برشلونة، إسبانيا\
'%}\
{{prompt_qa + '\n\nس: ' + question + '\nج:'}}"
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
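In the mc1 format used above, doc_to_target: 0 relies on the dataset ordering: the first entry of mc1_targets.choices is treated as the correct answer. A minimal sketch with a made-up document showing how the choice template and the fixed target index line up:

# Hypothetical mc1-style document (content made up): the gold choice comes first.
doc = {
    "question": "...",
    "mc1_targets": {
        "choices": ["correct answer", "wrong answer 1", "wrong answer 2"],
        "labels": [1, 0, 0],
    },
}
choices = doc["mc1_targets"]["choices"]  # what "{{mc1_targets.choices}}" resolves to
target_index = 0                          # doc_to_target: index of the gold choice
print(choices[target_index])              # "correct answer"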