"...resnet50_tensorflow.git" did not exist on "51cb03b0b7fdb7c1bd41d0f5cf5583ea50a98610"
Commit 4eecbabb authored by Baber
Browse files

Merge branch 'main' into prefill

parents dac8b534 fb963f0f
# Multiple-choice task over the `multiple_choice_facts_truefalse_balanced_task`
# config of OALL/AlGhafa-Arabic-LLM-Benchmark-Native. Rows are preprocessed by
# utils.process_docs, which supplies the `query`, `choices` and `gold` fields
# referenced below. Few-shot examples come from the validation split via the
# `first_n` sampler.
task: arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task
dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native
dataset_name: multiple_choice_facts_truefalse_balanced_task
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the `multiple_choice_grounded_statement_soqal_task`
# config of OALL/AlGhafa-Arabic-LLM-Benchmark-Native. Rows are preprocessed by
# utils.process_docs, which supplies the `query`, `choices` and `gold` fields
# referenced below. Few-shot examples come from the validation split via the
# `first_n` sampler.
task: arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task
dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native
dataset_name: multiple_choice_grounded_statement_soqal_task
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the
# `multiple_choice_grounded_statement_xglue_mlqa_task` config of
# OALL/AlGhafa-Arabic-LLM-Benchmark-Native. Rows are preprocessed by
# utils.process_docs, which supplies the `query`, `choices` and `gold` fields
# referenced below. Few-shot examples come from the validation split via the
# `first_n` sampler.
task: arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task
dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native
dataset_name: multiple_choice_grounded_statement_xglue_mlqa_task
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the
# `multiple_choice_rating_sentiment_no_neutral_task` config of
# OALL/AlGhafa-Arabic-LLM-Benchmark-Native. Rows are preprocessed by
# utils.process_docs, which supplies the `query`, `choices` and `gold` fields
# referenced below. Few-shot examples come from the validation split via the
# `first_n` sampler.
task: arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task
dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native
dataset_name: multiple_choice_rating_sentiment_no_neutral_task
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the `multiple_choice_rating_sentiment_task` config
# of OALL/AlGhafa-Arabic-LLM-Benchmark-Native. Rows are preprocessed by
# utils.process_docs, which supplies the `query`, `choices` and `gold` fields
# referenced below. Few-shot examples come from the validation split via the
# `first_n` sampler.
task: arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task
dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native
dataset_name: multiple_choice_rating_sentiment_task
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the `multiple_choice_sentiment_task` config of
# OALL/AlGhafa-Arabic-LLM-Benchmark-Native. Rows are preprocessed by
# utils.process_docs, which supplies the `query`, `choices` and `gold` fields
# referenced below. Few-shot examples come from the validation split via the
# `first_n` sampler.
task: arabic_leaderboard_alghafa_multiple_choice_sentiment_task
dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native
dataset_name: multiple_choice_sentiment_task
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
import datasets
import numpy as np
def process_docs(dataset: datasets.Dataset):
    """Add prompt, choice list and gold index to every row of ``dataset``.

    Each row is expected to carry a ``query`` (the question text), a ``label``
    (convertible to ``int``) and one additional column per answer option.
    Every column other than ``query``, ``label`` and ``__few_shots`` is
    treated as an answer option, in the row's own key order.

    Args:
        dataset: object exposing ``map(fn)``; ``fn`` is called with one row.

    Returns:
        The result of ``dataset.map`` applied with the per-row transform. The
        transform returns ``query`` (instruction, question, zero-based
        numbered options and the answer cue), ``choices`` (the option texts)
        and ``gold`` (the integer label).
    """
    # Columns that never hold an answer option.
    non_choice_keys = ("query", "label", "__few_shots")
    instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        question = doc["query"]
        answer_index = int(doc["label"])
        # Dynamically determine the choices from the remaining columns.
        choices = [doc[key] for key in doc if key not in non_choice_keys]
        # One "<index>) <choice>" line per option, each newline-terminated.
        numbered_choices = "".join(
            f"{index}) {choice}\n" for index, choice in enumerate(choices)
        )
        query = "".join(
            [
                instruction,
                f"السؤال: {question}\n",
                numbered_choices,
                "الإجابة:",
            ]
        )
        return {"query": query, "choices": choices, "gold": answer_index}

    return dataset.map(_process_doc)
# Multiple-choice task over the `default` config of OALL/Arabic_EXAMS. Rows
# are preprocessed by utils.process_docs, which supplies the `query`,
# `choices` and `gold` fields referenced below. Few-shot examples come from
# the validation split via the `first_n` sampler.
task: arabic_exams
dataset_path: OALL/Arabic_EXAMS
dataset_name: default
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Group wrapping the single `arabic_exams` task. Group-level acc and acc_norm
# are aggregated as a mean with `weight_by_size` enabled.
group: arabic_leaderboard_arabic_exams
task:
  - arabic_exams
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true
metadata:
  version: 1.0
import datasets
import numpy as np
# Arabic letters used as answer-option labels: process_docs indexes into this
# list when formatting each option and returns its first four entries as the
# task's choices.
# fmt: off
LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"]
# fmt: on
# Latin letters A-Z: process_docs looks up a row's "answer" value here to
# obtain its zero-based position.
# fmt: off
LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
# fmt: on
def process_docs(dataset: datasets.Dataset):
    """Turn each four-option exam row into prompt, choice labels and gold index.

    Rows are read through the keys ``subject``, ``question``, ``A``-``D`` and
    ``answer``. The per-row transform returns:

    * ``query``: an Arabic instruction naming the subject (underscores
      replaced by spaces), the question, the four options labelled with
      ``LETTER_INDICES_AR`` and the answer cue;
    * ``choices``: the first four entries of ``LETTER_INDICES_AR``;
    * ``gold``: the position of ``answer`` within ``LETTER_INDICES``.

    Returns the result of ``dataset.map`` applied with that transform.
    """

    def _process_doc(doc):
        subject = doc["subject"]
        stem = doc["question"]
        options = [doc[letter] for letter in ("A", "B", "C", "D")]
        # Each option line keeps its own trailing newline; joining with "\n"
        # below therefore leaves a blank line between options.
        option_lines = []
        for position, option in enumerate(options):
            option_lines.append(f" {LETTER_INDICES_AR[position]}) {option}\n")
        gold_position = LETTER_INDICES.index(doc["answer"])
        header = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {subject.replace('_', ' ')}. \n\n"
        prompt = "".join(
            [
                header,
                f"السؤال: {stem}\n",
                "\n".join(option_lines),
                "\nالإجابة:",
            ]
        )
        return {
            "query": prompt,
            "choices": LETTER_INDICES_AR[:4],
            "gold": gold_position,
        }

    return dataset.map(_process_doc)
# Group collecting the per-subject Arabic MMLU tasks listed under `task`.
# Group-level acc and acc_norm are aggregated as a mean with `weight_by_size`
# enabled.
group: arabic_leaderboard_arabic_mmlu
task:
  - arabic_leaderboard_arabic_mmlu_abstract_algebra
  - arabic_leaderboard_arabic_mmlu_anatomy
  - arabic_leaderboard_arabic_mmlu_astronomy
  - arabic_leaderboard_arabic_mmlu_business_ethics
  - arabic_leaderboard_arabic_mmlu_clinical_knowledge
  - arabic_leaderboard_arabic_mmlu_college_biology
  - arabic_leaderboard_arabic_mmlu_college_chemistry
  - arabic_leaderboard_arabic_mmlu_college_computer_science
  - arabic_leaderboard_arabic_mmlu_college_mathematics
  - arabic_leaderboard_arabic_mmlu_college_medicine
  - arabic_leaderboard_arabic_mmlu_college_physics
  - arabic_leaderboard_arabic_mmlu_computer_security
  - arabic_leaderboard_arabic_mmlu_conceptual_physics
  - arabic_leaderboard_arabic_mmlu_econometrics
  - arabic_leaderboard_arabic_mmlu_electrical_engineering
  - arabic_leaderboard_arabic_mmlu_elementary_mathematics
  - arabic_leaderboard_arabic_mmlu_formal_logic
  - arabic_leaderboard_arabic_mmlu_global_facts
  - arabic_leaderboard_arabic_mmlu_high_school_biology
  - arabic_leaderboard_arabic_mmlu_high_school_chemistry
  - arabic_leaderboard_arabic_mmlu_high_school_computer_science
  - arabic_leaderboard_arabic_mmlu_high_school_european_history
  - arabic_leaderboard_arabic_mmlu_high_school_geography
  - arabic_leaderboard_arabic_mmlu_high_school_government_and_politics
  - arabic_leaderboard_arabic_mmlu_high_school_macroeconomics
  - arabic_leaderboard_arabic_mmlu_high_school_mathematics
  - arabic_leaderboard_arabic_mmlu_high_school_microeconomics
  - arabic_leaderboard_arabic_mmlu_high_school_physics
  - arabic_leaderboard_arabic_mmlu_high_school_psychology
  - arabic_leaderboard_arabic_mmlu_high_school_statistics
  - arabic_leaderboard_arabic_mmlu_high_school_us_history
  - arabic_leaderboard_arabic_mmlu_high_school_world_history
  - arabic_leaderboard_arabic_mmlu_human_aging
  - arabic_leaderboard_arabic_mmlu_human_sexuality
  - arabic_leaderboard_arabic_mmlu_international_law
  - arabic_leaderboard_arabic_mmlu_jurisprudence
  - arabic_leaderboard_arabic_mmlu_logical_fallacies
  - arabic_leaderboard_arabic_mmlu_machine_learning
  - arabic_leaderboard_arabic_mmlu_management
  - arabic_leaderboard_arabic_mmlu_marketing
  - arabic_leaderboard_arabic_mmlu_medical_genetics
  - arabic_leaderboard_arabic_mmlu_miscellaneous
  - arabic_leaderboard_arabic_mmlu_moral_disputes
  - arabic_leaderboard_arabic_mmlu_moral_scenarios
  - arabic_leaderboard_arabic_mmlu_nutrition
  - arabic_leaderboard_arabic_mmlu_philosophy
  - arabic_leaderboard_arabic_mmlu_prehistory
  - arabic_leaderboard_arabic_mmlu_professional_accounting
  - arabic_leaderboard_arabic_mmlu_professional_law
  - arabic_leaderboard_arabic_mmlu_professional_medicine
  - arabic_leaderboard_arabic_mmlu_professional_psychology
  - arabic_leaderboard_arabic_mmlu_public_relations
  - arabic_leaderboard_arabic_mmlu_security_studies
  - arabic_leaderboard_arabic_mmlu_sociology
  - arabic_leaderboard_arabic_mmlu_us_foreign_policy
  - arabic_leaderboard_arabic_mmlu_virology
  - arabic_leaderboard_arabic_mmlu_world_religions
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true
metadata:
  version: 1.0
# Multiple-choice task over the `abstract_algebra` config of OALL/Arabic_MMLU.
# Rows are preprocessed by utils.process_docs, which supplies the `query`,
# `choices` and `gold` fields referenced below. Few-shot examples come from
# the dev split via the `first_n` sampler.
task: arabic_leaderboard_arabic_mmlu_abstract_algebra
dataset_path: OALL/Arabic_MMLU
dataset_name: abstract_algebra
output_type: multiple_choice
training_split: null
validation_split: dev
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: dev
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the `anatomy` config of OALL/Arabic_MMLU.
# Rows are preprocessed by utils.process_docs, which supplies the `query`,
# `choices` and `gold` fields referenced below. Few-shot examples come from
# the dev split via the `first_n` sampler.
task: arabic_leaderboard_arabic_mmlu_anatomy
dataset_path: OALL/Arabic_MMLU
dataset_name: anatomy
output_type: multiple_choice
training_split: null
validation_split: dev
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: dev
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the `astronomy` config of OALL/Arabic_MMLU.
# Rows are preprocessed by utils.process_docs, which supplies the `query`,
# `choices` and `gold` fields referenced below. Few-shot examples come from
# the dev split via the `first_n` sampler.
task: arabic_leaderboard_arabic_mmlu_astronomy
dataset_path: OALL/Arabic_MMLU
dataset_name: astronomy
output_type: multiple_choice
training_split: null
validation_split: dev
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: dev
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the `business_ethics` config of OALL/Arabic_MMLU.
# Rows are preprocessed by utils.process_docs, which supplies the `query`,
# `choices` and `gold` fields referenced below. Few-shot examples come from
# the dev split via the `first_n` sampler.
task: arabic_leaderboard_arabic_mmlu_business_ethics
dataset_path: OALL/Arabic_MMLU
dataset_name: business_ethics
output_type: multiple_choice
training_split: null
validation_split: dev
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: dev
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the `clinical_knowledge` config of
# OALL/Arabic_MMLU. Rows are preprocessed by utils.process_docs, which
# supplies the `query`, `choices` and `gold` fields referenced below.
# Few-shot examples come from the dev split via the `first_n` sampler.
task: arabic_leaderboard_arabic_mmlu_clinical_knowledge
dataset_path: OALL/Arabic_MMLU
dataset_name: clinical_knowledge
output_type: multiple_choice
training_split: null
validation_split: dev
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: dev
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the `college_biology` config of OALL/Arabic_MMLU.
# Rows are preprocessed by utils.process_docs, which supplies the `query`,
# `choices` and `gold` fields referenced below. Few-shot examples come from
# the dev split via the `first_n` sampler.
task: arabic_leaderboard_arabic_mmlu_college_biology
dataset_path: OALL/Arabic_MMLU
dataset_name: college_biology
output_type: multiple_choice
training_split: null
validation_split: dev
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: dev
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the `college_chemistry` config of
# OALL/Arabic_MMLU. Rows are preprocessed by utils.process_docs, which
# supplies the `query`, `choices` and `gold` fields referenced below.
# Few-shot examples come from the dev split via the `first_n` sampler.
task: arabic_leaderboard_arabic_mmlu_college_chemistry
dataset_path: OALL/Arabic_MMLU
dataset_name: college_chemistry
output_type: multiple_choice
training_split: null
validation_split: dev
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: dev
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the `college_computer_science` config of
# OALL/Arabic_MMLU. Rows are preprocessed by utils.process_docs, which
# supplies the `query`, `choices` and `gold` fields referenced below.
# Few-shot examples come from the dev split via the `first_n` sampler.
task: arabic_leaderboard_arabic_mmlu_college_computer_science
dataset_path: OALL/Arabic_MMLU
dataset_name: college_computer_science
output_type: multiple_choice
training_split: null
validation_split: dev
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: dev
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
# Multiple-choice task over the `college_mathematics` config of
# OALL/Arabic_MMLU. Rows are preprocessed by utils.process_docs, which
# supplies the `query`, `choices` and `gold` fields referenced below.
# Few-shot examples come from the dev split via the `first_n` sampler.
task: arabic_leaderboard_arabic_mmlu_college_mathematics
dataset_path: OALL/Arabic_MMLU
dataset_name: college_mathematics
output_type: multiple_choice
training_split: null
validation_split: dev
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: dev
fewshot_config:
  sampler: first_n
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment