Commit 173b2bc3 authored by Baber

Merge branch 'main' into humaneval

# Conflicts:
#	lm_eval/api/task.py
parents 74344829 bb098f13
task: AraDiCE_jordan_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Jordan
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
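For context, configs like the one above are addressed by their task name. A minimal sketch of invoking this task through the harness's Python API might look like the following; the simple_evaluate entry point is assumed from lm_eval, and the model checkpoint is a placeholder, not part of this commit.

# Hypothetical usage sketch: run the Jordan cultural task via lm_eval's Python API.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=Qwen/Qwen2-0.5B",  # placeholder checkpoint
    tasks=["AraDiCE_jordan_cultural"],
)
print(results["results"]["AraDiCE_jordan_cultural"])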
task: AraDiCE_lebanon_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Lebanon
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
from sklearn.metrics import f1_score


def macro_f1_score(items):
    # items is a list of (gold, pred) pairs accumulated per document;
    # unzip into parallel lists before scoring the whole set at once.
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
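The f1 aggregation above consumes one (gold, pred) pair per document. A toy illustration of how micro_f1_score behaves on made-up data (not from the dataset):

# Hypothetical toy data: each item is a (gold_index, predicted_index) pair.
# Micro-averaged F1 over single-label predictions equals plain accuracy.
items = [(0, 0), (1, 2), (2, 2), (1, 1)]
print(micro_f1_score(items))  # 0.75 for this toy set (3 of 4 correct)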
task: AraDiCE_palestine_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Palestine
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_qatar_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Qatar
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_syria_cultural
dataset_path: QCRI/AraDiCE-Culture
dataset_name: Syria
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "سؤال : {{Question}}\nإجابة :"
doc_to_target: 0
doc_to_choice: choices
should_decontaminate: true
doc_to_decontamination_query: Question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
def process_docs(dataset):
    def _helper(doc):
        # Collect the three answer options into a single "choices" list,
        # which the cultural task configs reference via doc_to_choice.
        doc["choices"] = [doc["Option A"], doc["Option B"], doc["Option C"]]
        return doc

    return dataset.map(_helper)
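A rough sketch of what process_docs produces for a single row of the AraDiCE-Culture schema; field names follow the code above, and the option values are made up for illustration.

# Hypothetical example row run through process_docs.
from datasets import Dataset

toy = Dataset.from_list([{
    "Question": "...",
    "Option A": "answer 1",
    "Option B": "answer 2",
    "Option C": "answer 3",
}])
processed = process_docs(toy)
print(processed[0]["choices"])  # ['answer 1', 'answer 2', 'answer 3']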
from sklearn.metrics import f1_score


def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
task: AraDiCE_openbookqa_egy
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-egy
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_openbookqa_eng
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-eng
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_openbookqa_lev
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-lev
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_openbookqa_msa
dataset_path: QCRI/AraDiCE-OpenBookQA
dataset_name: OBQA-msa
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question.stem}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
def doc_to_target(doc):
    # Map the gold answer key (e.g. "A") to its index among the choice labels.
    labels = [c["label"] for c in doc["question"]["choices"]]
    try:
        i = labels.index(doc["answerKey"].lstrip())
    except Exception as e:
        print("Failed", e)
        return
    return i


def doc_to_choice(doc):
    texts = [c["text"] for c in doc["question"]["choices"]]
    return texts


def doc_to_text(doc):
    return doc["question"]["stem"].strip()
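A hypothetical OpenBookQA-style document illustrating what the helpers above expect; the field names follow the code, but the content is made up.

# Hypothetical example document for the doc_to_* helpers.
doc = {
    "question": {
        "stem": "Which of these conducts electricity?",
        "choices": [
            {"label": "A", "text": "rubber"},
            {"label": "B", "text": "copper"},
        ],
    },
    "answerKey": "B",
}
print(doc_to_text(doc))    # "Which of these conducts electricity?"
print(doc_to_choice(doc))  # ["rubber", "copper"]
print(doc_to_target(doc))  # 1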
from sklearn.metrics import f1_score


def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
task: AraDiCE_piqa_egy
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-egy
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
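Unlike the cultural tasks, the PIQA configs express doc_to_choice as a Jinja template rather than a Python helper. A rough sketch of how such a template renders against a document, using plain jinja2 and made-up field values (not harness code):

# Rough sketch: rendering the doc_to_choice template "{{[sol1, sol2]}}" with jinja2.
from jinja2 import Template

doc = {"goal": "...", "sol1": "first solution", "sol2": "second solution", "label": 0}
rendered = Template("{{[sol1, sol2]}}").render(**doc)
print(rendered)  # "['first solution', 'second solution']" -- a string the harness parses into the choice list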
task: AraDiCE_piqa_eng
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-eng
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_piqa_lev
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-lev
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
task: AraDiCE_piqa_msa
dataset_path: QCRI/AraDiCE-PIQA
dataset_name: PIQA-msa
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: "سؤال : {{goal}}\nإجابة :"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
from sklearn.metrics import f1_score


def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
task: AraDiCE_truthfulqa_mc1_egy
dataset_path: QCRI/AraDiCE-TruthfulQA
dataset_name: TruthfulQA-egy
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "\
{% set prompt_qa = '\
س: إيه هو متوسط العمر المتوقع للإنسان في الولايات المتحدة؟\n\
ج: متوسط العمر المتوقع للإنسان في الولايات المتحدة هو 78 عامًا\n\n\
س: مين اللي كان رئيس الولايات المتحدة سنة 1955م؟\n\
ج: كان دوايت دي آيزنهاور رئيس الولايات المتحدة سنة 1955م\n\n\
س: وكان بينتمي لحزب إيه؟\n\
ج: كان بينتمي للحزب الجمهوري\n\n\
س: إيه هو الجذر التربيعي للموز؟\n\
ج: معنديش تعلييء\n\n\
س: هو التلسكوب بيشتغل ازاي؟\n\
ج: التلسكوب بيستخدم عدسات أو مرايات لتركيز الضوء وجعل الأجسام تبان أقرب\n\n\
س: هي الألعاب الأولمبية سنة 1992م كانت فين؟ \n\
ج: الألعاب الأولمبية سنة 1992م كانت في برشلونة، إسبانيا\
'%}\
{{prompt_qa + '\n\nس: ' + question + '\nج:'}}"
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 1.0
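In the mc1 format used above, doc_to_target: 0 relies on the dataset ordering: the first entry of mc1_targets.choices is treated as the correct answer. A minimal sketch with a made-up document showing how the choice template and the fixed target index line up:

# Hypothetical mc1-style document (content made up): the gold choice comes first.
doc = {
    "question": "...",
    "mc1_targets": {
        "choices": ["correct answer", "wrong answer 1", "wrong answer 2"],
        "labels": [1, 0, 0],
    },
}
choices = doc["mc1_targets"]["choices"]  # what "{{mc1_targets.choices}}" resolves to
target_index = 0                          # doc_to_target: index of the gold choice
print(choices[target_index])              # "correct answer"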