Merge branch 'main' into prefill

4eecbabb · Baber · dac8b534 · fb963f0f · 4eecbabb · 4eecbabb
Commit 4eecbabb authored Sep 16, 2024 by Baber
20 changed files
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_medicine.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_medicine.yaml
+task: arabic_leaderboard_arabic_mmlu_professional_medicine
+dataset_path: OALL/Arabic_MMLU
+dataset_name: professional_medicine
+output_type: multiple_choice
+training_split: null
+validation_split: dev
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_psychology.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_psychology.yaml
+task: arabic_leaderboard_arabic_mmlu_professional_psychology
+dataset_path: OALL/Arabic_MMLU
+dataset_name: professional_psychology
+output_type: multiple_choice
+training_split: null
+validation_split: dev
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_public_relations.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_public_relations.yaml
+task: arabic_leaderboard_arabic_mmlu_public_relations
+dataset_path: OALL/Arabic_MMLU
+dataset_name: public_relations
+output_type: multiple_choice
+training_split: null
+validation_split: dev
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_security_studies.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_security_studies.yaml
+task: arabic_leaderboard_arabic_mmlu_security_studies
+dataset_path: OALL/Arabic_MMLU
+dataset_name: security_studies
+output_type: multiple_choice
+training_split: null
+validation_split: dev
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_sociology.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_sociology.yaml
+task: arabic_leaderboard_arabic_mmlu_sociology
+dataset_path: OALL/Arabic_MMLU
+dataset_name: sociology
+output_type: multiple_choice
+training_split: null
+validation_split: dev
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_us_foreign_policy.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_us_foreign_policy.yaml
+task: arabic_leaderboard_arabic_mmlu_us_foreign_policy
+dataset_path: OALL/Arabic_MMLU
+dataset_name: us_foreign_policy
+output_type: multiple_choice
+training_split: null
+validation_split: dev
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_virology.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_virology.yaml
+task: arabic_leaderboard_arabic_mmlu_virology
+dataset_path: OALL/Arabic_MMLU
+dataset_name: virology
+output_type: multiple_choice
+training_split: null
+validation_split: dev
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_world_religions.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_world_religions.yaml
+task: arabic_leaderboard_arabic_mmlu_world_religions
+dataset_path: OALL/Arabic_MMLU
+dataset_name: world_religions
+output_type: multiple_choice
+training_split: null
+validation_split: dev
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/utils.py
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/utils.py
+import datasets
+import numpy as np
+
+
+# fmt: off
+LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"]
+# fmt: on
+
+
+# fmt: off
+LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
+# fmt: on
+
+
+def process_docs(dataset: datasets.Dataset):
+    def _process_doc(doc):
+        topic = doc["subject"]
+        instruction = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {topic.replace('_', ' ')}. \n\n"
+        choices = [doc["A"], doc["B"], doc["C"], doc["D"]]
+        # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES,
+        # it will then be applied to arabic letters
+        gold_ix = LETTER_INDICES.index(doc["answer"])
+
+        query = f"{instruction}{doc['question']}\n"
+        query += "".join(
+            [
+                f"{key}. {choice}\n"
+                for key, choice in zip(LETTER_INDICES_AR[:4], choices)
+            ]
+        )
+        query += "الإجابة:"
+
+        return {"query": query, "choices": LETTER_INDICES_AR[:4], "gold": gold_ix}
+
+    return dataset.map(_process_doc)
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_challenge/arabic_leaderboard_arabic_mt_arc_challenge.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_challenge/arabic_leaderboard_arabic_mt_arc_challenge.yaml
+group: arabic_leaderboard_arabic_mt_arc_challenge
+task:
+  - arabic_mt_arc_challenge
+
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_challenge/arabic_mt_arc_challenge.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_challenge/arabic_mt_arc_challenge.yaml
+task: arabic_mt_arc_challenge
+dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Translated
+dataset_name: arc_challenge_okapi_ar
+output_type: multiple_choice
+training_split: null
+validation_split: validation
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: validation
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_challenge/utils.py
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_challenge/utils.py
+import datasets
+import numpy as np
+
+
+def process_docs(dataset: datasets.Dataset):
+    def _process_doc(doc):
+        question = doc["query"]
+        answer_index = int(doc["label"])
+        # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label'
+        choices_keys = [
+            key for key in doc.keys() if key not in ["query", "label", "__few_shots"]
+        ]
+        choices = [doc[key] for key in choices_keys]
+
+        instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"
+        query = f"{instruction}السؤال: {question}\n"
+        for index, choice in enumerate(choices):
+            query += f"{index}) {choice}\n"
+        query += "الإجابة:"
+
+        return {"query": query, "choices": choices, "gold": answer_index}
+
+    return dataset.map(_process_doc)
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_easy/arabic_leaderboard_arabic_mt_arc_easy.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_easy/arabic_leaderboard_arabic_mt_arc_easy.yaml
+group: arabic_leaderboard_arabic_mt_arc_easy
+task:
+  - arabic_mt_arc_easy
+
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_easy/arabic_mt_arc_easy.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_easy/arabic_mt_arc_easy.yaml
+task: arabic_mt_arc_easy
+dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Translated
+dataset_name: arc_easy_ar
+output_type: multiple_choice
+training_split: null
+validation_split: validation
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: validation
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_easy/utils.py
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_easy/utils.py
+import datasets
+import numpy as np
+
+
+def process_docs(dataset: datasets.Dataset):
+    def _process_doc(doc):
+        question = doc["query"]
+        answer_index = int(doc["label"])
+        # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label'
+        choices_keys = [
+            key for key in doc.keys() if key not in ["query", "label", "__few_shots"]
+        ]
+        choices = [doc[key] for key in choices_keys]
+
+        instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"
+        query = f"{instruction}السؤال: {question}\n"
+        for index, choice in enumerate(choices):
+            query += f"{index}) {choice}\n"
+        query += "الإجابة:"
+
+        return {"query": query, "choices": choices, "gold": answer_index}
+
+    return dataset.map(_process_doc)
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_leaderboard_arabic_mt_boolq.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_leaderboard_arabic_mt_boolq.yaml
+group: arabic_leaderboard_arabic_mt_boolq
+task:
+  - arabic_mt_boolq
+
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_mt_boolq.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_mt_boolq.yaml
+task: arabic_mt_boolq
+dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Translated
+dataset_name: boolq_ar
+output_type: multiple_choice
+training_split: null
+validation_split: validation
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: validation
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/utils.py
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/utils.py
+import datasets
+import numpy as np
+
+
+def process_docs(dataset: datasets.Dataset):
+    def _process_doc(doc):
+        question = doc["question"]
+        passage = doc["passage"]
+        instruction = "بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا"
+        query = f"""{instruction}
+        المقطع :
+        {passage}
+        السؤال:
+        {question}
+        الإجابة:
+        """
+
+        return {
+            "query": query,
+            "choices": ["نعم", "لا"],
+            "gold": 0 if doc["answer"] else 1,
+        }
+
+    return dataset.map(_process_doc)
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_copa/arabic_leaderboard_arabic_mt_copa.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_copa/arabic_leaderboard_arabic_mt_copa.yaml
+group: arabic_leaderboard_arabic_mt_copa
+task:
+  - arabic_mt_copa
+
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_copa/arabic_mt_copa.yaml
+++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_copa/arabic_mt_copa.yaml
+task: arabic_mt_copa
+dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Translated
+dataset_name: copa_ext_ar
+output_type: multiple_choice
+training_split: null
+validation_split: validation
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: "{{query}}"
+doc_to_target: "{{gold}}"
+doc_to_choice: "choices"
+fewshot_split: validation
+fewshot_config:
+  sampler: first_n
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0