Commit 2b56339e authored by Baber

Merge branch 'main' into longcxt

parents 0b533339 703fbffd
task: AraDiCE_boolq_msa
dataset_path: QCRI/AraDiCE-BoolQ
dataset_name: BoolQ-msa
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{passage}}\nسؤال: {{question}}؟\nجواب:"
doc_to_target: target
doc_to_choice: ["لا", "نعم"]
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
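For orientation, the doc_to_text value above is a Jinja2 template that the harness fills in per document. A minimal sketch of that rendering step, using a made-up document (the real fields come from QCRI/AraDiCE-BoolQ, and process_docs has already stripped the trailing question mark):

from jinja2 import Template

# Hypothetical document, invented for illustration.
doc = {"passage": "القاهرة هي عاصمة مصر.", "question": "هل القاهرة عاصمة مصر"}

prompt = Template("{{passage}}\nسؤال: {{question}}؟\nجواب:").render(**doc)
print(prompt)
# القاهرة هي عاصمة مصر.
# سؤال: هل القاهرة عاصمة مصر؟
# جواب: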
from sklearn.metrics import f1_score


def macro_f1_score(items):
    # Each item is a (gold, prediction) pair collected per document by the harness.
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
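As a usage sketch for the aggregations above: in lm-evaluation-harness, each item handed to an f1 aggregation is, to my understanding, a (gold, prediction) pair of choice indices collected per document, so the helpers can be exercised directly:

# Illustrative only: three documents, gold index vs. predicted index.
items = [(1, 1), (0, 1), (1, 1)]
print(micro_f1_score(items))  # 0.666..., i.e. 2 of 3 predictions correct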
msa_answer_mapping = {"true": "نعم", "false": "لا", True: "نعم", False: "لا"}


def process_docs(dataset):
    def remove_question_mark(text):
        # Strip one trailing Latin (?) or Arabic (؟) question mark; the
        # doc_to_text template appends "؟" itself.
        text = text.strip()
        if text.endswith("?") or text.endswith("؟"):
            text = text[:-1]
        text = text.strip()
        return text

    def _helper(doc):
        doc["question"] = remove_question_mark(doc["question"])
        # Map the boolean or string answer onto the Arabic choice strings
        # ("نعم"/"لا") listed in doc_to_choice.
        doc["target"] = msa_answer_mapping[doc["answer"]]
        return doc

    return dataset.map(_helper)
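A quick usage sketch for process_docs, using an in-memory datasets.Dataset with invented rows (the real split is QCRI/AraDiCE-BoolQ):

from datasets import Dataset

rows = [
    {"question": "هل القاهرة عاصمة مصر؟", "answer": True},
    {"question": "هل باريس عاصمة مصر?", "answer": False},
]
processed = process_docs(Dataset.from_list(rows))
print(processed[0]["question"], processed[0]["target"])  # هل القاهرة عاصمة مصر نعم
print(processed[1]["target"])                            # لا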
task: AraDiCE_egypt_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Egypt
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_jordan_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Jordan
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_lebanon_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Lebanon
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
from sklearn.metrics import f1_score


def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
task: AraDiCE_palestine_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Palestine
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_qatar_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Qatar
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_syria_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Syria
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
def process_docs(dataset):
    def _helper(doc):
        # Gather the three option columns into one list; doc_to_target is the
        # constant 0, so "Option A" always holds the gold answer.
        doc["choices"] = [doc["Option A"], doc["Option B"], doc["Option C"]]
        return doc

    return dataset.map(_helper)
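A usage sketch for this process_docs, with an invented row (real columns come from QCRI/AraDiCE-Culture):

from datasets import Dataset

row = {
    "Question": "ما هو أشهر طبق مصري؟",
    "Option A": "كشري",
    "Option B": "منسف",
    "Option C": "مقلوبة",
}
doc = process_docs(Dataset.from_list([row]))[0]
print(doc["choices"])  # ['كشري', 'منسف', 'مقلوبة']; index 0 is the gold answer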
from sklearn.metrics import f1_score


def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
task: AraDiCE_openbookqa_egy
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-egy
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_openbookqa_eng
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-eng
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_openbookqa_lev
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-lev
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_openbookqa_msa
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-msa
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
def doc_to_target(doc):
    labels = [c["label"] for c in doc["question"]["choices"]]
    try:
        i = labels.index(doc["answerKey"].lstrip())
    except ValueError as e:
        # answerKey did not match any choice label.
        print("Failed", e)
        return None
    return i


def doc_to_choice(doc):
    texts = [c["text"] for c in doc["question"]["choices"]]
    return texts


def doc_to_text(doc):
    return doc["question"]["stem"].strip()
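The three helpers above can be checked against a hypothetical OpenBookQA-style document (field values invented; the structure mirrors the nested question.stem / question.choices layout the configs reference):

doc = {
    "question": {
        "stem": "Which of these conducts electricity best? ",
        "choices": [
            {"label": "A", "text": "rubber"},
            {"label": "B", "text": "copper"},
        ],
    },
    "answerKey": "B",
}
print(doc_to_text(doc))    # Which of these conducts electricity best?
print(doc_to_choice(doc))  # ['rubber', 'copper']
print(doc_to_target(doc))  # 1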
from sklearn.metrics import f1_score


def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
task: AraDiCE_piqa_egy
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-egy
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
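Note the doc_to_choice value above: "{{[sol1, sol2]}}" is a Jinja2 expression that renders the two solution columns as a Python-style list literal, which the harness then evaluates back into a list (via ast.literal_eval, if memory serves); doc_to_target: label then selects index 0 or 1 from it. A sketch of just the rendering step, with invented fields:

from jinja2 import Template

# Hypothetical PIQA-style document, invented for illustration.
doc = {"sol1": "ضعها في ماء مغلي.", "sol2": "ضعها في الثلاجة.", "label": 0}
print(Template("{{[sol1, sol2]}}").render(**doc))
# ['ضعها في ماء مغلي.', 'ضعها في الثلاجة.']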
task: AraDiCE_piqa_eng
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-eng
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0