gaoqiong / lm-evaluation-harness · Commit 2b56339e
Authored Jan 17, 2025 by Baber
Merge branch 'main' into longcxt
Parents: 0b533339, 703fbffd

Changes: 316 files in total; this page shows 20 changed files with 461 additions and 0 deletions (+461, -0).
lm_eval/tasks/aradice/boolq/MSA/boolq_msa.yaml (+25, -0)
lm_eval/tasks/aradice/boolq/MSA/metrics.py (+25, -0)
lm_eval/tasks/aradice/boolq/MSA/utils.py (+18, -0)
lm_eval/tasks/aradice/cultural-benchmark/egypt.yaml (+25, -0)
lm_eval/tasks/aradice/cultural-benchmark/jordan.yaml (+25, -0)
lm_eval/tasks/aradice/cultural-benchmark/lebanon.yaml (+25, -0)
lm_eval/tasks/aradice/cultural-benchmark/metrics.py (+25, -0)
lm_eval/tasks/aradice/cultural-benchmark/palestine.yaml (+25, -0)
lm_eval/tasks/aradice/cultural-benchmark/qatar.yaml (+25, -0)
lm_eval/tasks/aradice/cultural-benchmark/syria.yaml (+25, -0)
lm_eval/tasks/aradice/cultural-benchmark/utils.py (+6, -0)
lm_eval/tasks/aradice/openbookqa/metrics.py (+25, -0)
lm_eval/tasks/aradice/openbookqa/openbookqa_egy.yaml (+24, -0)
lm_eval/tasks/aradice/openbookqa/openbookqa_eng.yaml (+24, -0)
lm_eval/tasks/aradice/openbookqa/openbookqa_lev.yaml (+24, -0)
lm_eval/tasks/aradice/openbookqa/openbookqa_msa.yaml (+24, -0)
lm_eval/tasks/aradice/openbookqa/utils.py (+18, -0)
lm_eval/tasks/aradice/piqa/metrics.py (+25, -0)
lm_eval/tasks/aradice/piqa/piqa_egy.yaml (+24, -0)
lm_eval/tasks/aradice/piqa/piqa_eng.yaml (+24, -0)
lm_eval/tasks/aradice/boolq/MSA/boolq_msa.yaml (new file, mode 100644)

```yaml
task: AraDiCE_boolq_msa
dataset_path: QCRI/AraDiCE-BoolQ
dataset_name: BoolQ-msa
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{passage}}\nسؤال: {{question}}؟\nجواب:"
doc_to_target: target
doc_to_choice: ["لا", "نعم"]
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
```
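For context, a task defined this way is invoked by its `task` name. A minimal sketch, assuming the harness's v0.4-style Python API; the `gpt2` checkpoint is just a placeholder, not something this commit prescribes:

```python
# Sketch: run the new task end to end via the harness API.
# Assumes lm-evaluation-harness v0.4+; the model choice is a placeholder.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",                    # Hugging Face causal-LM backend
    model_args="pretrained=gpt2",  # placeholder checkpoint
    tasks=["AraDiCE_boolq_msa"],   # task name from the YAML above
)
print(results["results"]["AraDiCE_boolq_msa"])  # reports acc, acc_norm, f1
```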
lm_eval/tasks/aradice/boolq/MSA/metrics.py (new file, mode 100644)

```python
from sklearn.metrics import f1_score


def macro_f1_score(items):
    # items: list of (gold, pred) pairs accumulated per document.
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
```
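Each aggregation receives the items the harness accumulates per document; for this metric each item is a (gold, pred) pair, so `zip(*items)` separates references from predictions. A toy check (not part of the commit; run from this directory with scikit-learn installed):

```python
from metrics import macro_f1_score, micro_f1_score, weighted_f1_score

# Hypothetical (gold, pred) pairs; in the harness these are collected
# per evaluated document before aggregation.
items = [(1, 1), (0, 1), (1, 0), (0, 0)]
print(micro_f1_score(items))     # 0.5
print(macro_f1_score(items))     # 0.5
print(weighted_f1_score(items))  # 0.5
```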
lm_eval/tasks/aradice/boolq/MSA/utils.py (new file, mode 100644)

```python
# Answers may arrive as strings or booleans; map both to Arabic yes/no.
msa_answer_mapping = {
    "true": "نعم",
    "false": "لا",
    True: "نعم",
    False: "لا",
}


def process_docs(dataset):
    def remove_question_mark(text):
        # Strip one trailing Latin or Arabic question mark; the prompt
        # template in boolq_msa.yaml re-appends "؟" itself.
        text = text.strip()
        if text.endswith("?") or text.endswith("؟"):
            text = text[:-1]
        text = text.strip()
        return text

    def _helper(doc):
        doc["question"] = remove_question_mark(doc["question"])
        doc["target"] = msa_answer_mapping[doc["answer"]]
        return doc

    return dataset.map(_helper)
```
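A quick round-trip of `process_docs` on a toy split (not part of the commit), assuming the `datasets` library the harness already depends on:

```python
from datasets import Dataset
from utils import process_docs

toy = Dataset.from_list(
    [{"passage": "السماء زرقاء.", "question": " هل السماء زرقاء؟ ", "answer": True}]
)
doc = process_docs(toy)[0]
print(doc["question"])  # trailing "؟" and surrounding whitespace stripped
print(doc["target"])    # "نعم"
```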
lm_eval/tasks/aradice/cultural-benchmark/egypt.yaml (new file, mode 100644)

```yaml
task: AraDiCE_egypt_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Egypt
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال: {{Question}}\nإجابة:"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
```
lm_eval/tasks/aradice/cultural-benchmark/jordan.yaml (new file, mode 100644)

Same as egypt.yaml above, differing only in `task: AraDiCE_jordan_cultural` and `dataset_name: Jordan`.
lm_eval/tasks/aradice/cultural-benchmark/lebanon.yaml (new file, mode 100644)

Same as egypt.yaml above, differing only in `task: AraDiCE_lebanon_cultural` and `dataset_name: Lebanon`.
lm_eval/tasks/aradice/cultural-benchmark/metrics.py (new file, mode 100644)

Same `macro_f1_score` / `micro_f1_score` / `weighted_f1_score` definitions as lm_eval/tasks/aradice/boolq/MSA/metrics.py above.
lm_eval/tasks/aradice/cultural-benchmark/palestine.yaml (new file, mode 100644)

Same as egypt.yaml above, differing only in `task: AraDiCE_palestine_cultural` and `dataset_name: Palestine`.
lm_eval/tasks/aradice/cultural-benchmark/qatar.yaml (new file, mode 100644)

Same as egypt.yaml above, differing only in `task: AraDiCE_qatar_cultural` and `dataset_name: Qatar`.
lm_eval/tasks/aradice/cultural-benchmark/syria.yaml (new file, mode 100644)

Same as egypt.yaml above, differing only in `task: AraDiCE_syria_cultural` and `dataset_name: Syria`.
lm_eval/tasks/aradice/cultural-benchmark/utils.py (new file, mode 100644)

```python
def process_docs(dataset):
    def _helper(doc):
        # Collect the three options into a single `choices` list; the
        # cultural task YAMLs point doc_to_target at index 0 ("Option A").
        doc["choices"] = [doc["Option A"], doc["Option B"], doc["Option C"]]
        return doc

    return dataset.map(_helper)
```
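And the same kind of toy check (not part of the commit, field values invented) for the cultural-benchmark preprocessing:

```python
from datasets import Dataset
from utils import process_docs

toy = Dataset.from_list(
    [{"Question": "ما عاصمة مصر؟", "Option A": "القاهرة",
      "Option B": "الإسكندرية", "Option C": "أسوان"}]
)
print(process_docs(toy)[0]["choices"])  # ['القاهرة', 'الإسكندرية', 'أسوان']
```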
lm_eval/tasks/aradice/openbookqa/metrics.py (new file, mode 100644)

Same `macro_f1_score` / `micro_f1_score` / `weighted_f1_score` definitions as lm_eval/tasks/aradice/boolq/MSA/metrics.py above.
lm_eval/tasks/aradice/openbookqa/openbookqa_egy.yaml (new file, mode 100644)

```yaml
task: AraDiCE_openbookqa_egy
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-egy
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
```
lm_eval/tasks/aradice/openbookqa/openbookqa_eng.yaml (new file, mode 100644)

Same as openbookqa_egy.yaml above, differing only in `task: AraDiCE_openbookqa_eng` and `dataset_name: OBQA-eng`.
lm_eval/tasks/aradice/openbookqa/openbookqa_lev.yaml (new file, mode 100644)

Same as openbookqa_egy.yaml above, differing only in `task: AraDiCE_openbookqa_lev` and `dataset_name: OBQA-lev`.
lm_eval/tasks/aradice/openbookqa/openbookqa_msa.yaml (new file, mode 100644)

Same as openbookqa_egy.yaml above, differing only in `task: AraDiCE_openbookqa_msa` and `dataset_name: OBQA-msa`.
lm_eval/tasks/aradice/openbookqa/utils.py (new file, mode 100644)

```python
def doc_to_target(doc):
    # Index of the choice whose label matches the answer key (leading
    # whitespace stripped); logs and returns None if no label matches.
    labels = [c["label"] for c in doc["question"]["choices"]]
    try:
        i = labels.index(doc["answerKey"].lstrip())
    except Exception as e:
        print("Failed", e)
        return
    return i


def doc_to_choice(doc):
    texts = [c["text"] for c in doc["question"]["choices"]]
    return texts


def doc_to_text(doc):
    return doc["question"]["stem"].strip()
```
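A hypothetical OBQA-style document (values invented, not from the dataset) showing how the three accessors cooperate; `doc_to_target` returns the index of the choice whose label matches `answerKey`:

```python
from utils import doc_to_choice, doc_to_target, doc_to_text

doc = {
    "answerKey": "B",
    "question": {
        "stem": "Which object is heavier? ",
        "choices": [
            {"label": "A", "text": "a feather"},
            {"label": "B", "text": "a brick"},
        ],
    },
}
print(doc_to_text(doc))    # "Which object is heavier?"
print(doc_to_choice(doc))  # ['a feather', 'a brick']
print(doc_to_target(doc))  # 1
```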
lm_eval/tasks/aradice/piqa/metrics.py (new file, mode 100644)

Same `macro_f1_score` / `micro_f1_score` / `weighted_f1_score` definitions as lm_eval/tasks/aradice/boolq/MSA/metrics.py above.
lm_eval/tasks/aradice/piqa/piqa_egy.yaml (new file, mode 100644)

```yaml
task: AraDiCE_piqa_egy
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-egy
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال: {{goal}}\nإجابة:"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
```
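The harness renders `doc_to_text` and `doc_to_choice` templates with Jinja2; a toy rendering of the two templates above (field values invented), with the rendered `[sol1, sol2]` list serving as the candidate answers:

```python
from jinja2 import Template

doc = {"goal": "كيف أسلق بيضة؟", "sol1": "في ماء مغلي", "sol2": "في الثلاجة"}
print(Template("سؤال: {{goal}}\nإجابة:").render(**doc))  # the prompt
print(Template("{{[sol1, sol2]}}").render(**doc))        # the two choices
```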
lm_eval/tasks/aradice/piqa/piqa_eng.yaml (new file, mode 100644)

Same as piqa_egy.yaml above, differing only in `task: AraDiCE_piqa_eng` and `dataset_name: PIQA-eng`.