Commit 948f120f authored by Baber
Browse files

Merge branch 'main' into autobatchtest

# Conflicts:
#	lm_eval/models/huggingface.py
parents a5b1c7a8 bd80a6c0
# Task config: "Mauritania" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Mauritania_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Mauritania
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "Mesopotamia_civilization" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Mesopotamia_civilization_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Mesopotamia_civilization
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "Morocco" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Morocco_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Morocco
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "Oman" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Oman_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Oman
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "Palestine" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Palestine_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Palestine
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "Qatar" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Qatar_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Qatar
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "Saudi_Arabia" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Saudi_Arabia_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Saudi_Arabia
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "Somalia" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Somalia_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Somalia
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "Sudan" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Sudan_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Sudan
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "Syria" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Syria_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Syria
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "Tunisia" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Tunisia_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Tunisia
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "United_Arab_Emirates" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_United_Arab_Emirates_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: United_Arab_Emirates
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "Yemen" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_Yemen_light
# Uses the 10% sample like every other *_light ACVA task, so the size-weighted
# group aggregate is computed over comparable subsets.
# NOTE(review): confirm the 10percent dataset exposes a "Yemen" config.
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Yemen
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "communication" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_communication_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: communication
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "computer_and_phone" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_computer_and_phone_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: computer_and_phone
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "daily_life" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_daily_life_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: daily_life
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Task config: "entertainment" subset of arcee-globe/ACVA-10percent, output type multiple_choice.
# Declares validation and test splits (no training split); few-shot examples are taken
# from the validation split with the first_n sampler.
# query / gold / choices are the keys returned by the process_docs hook referenced below.
# Reports acc and acc_norm, each aggregated by mean with higher_is_better: true.
task: arabic_leaderboard_acva_entertainment_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: entertainment
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
# Group config bundling the per-subset ACVA "light" tasks under one name.
# Each entry under task: is the task name of one subset config.
# Group-level acc and acc_norm use mean aggregation with weight_by_size: true.
group: arabic_leaderboard_acva_light
task:
- arabic_leaderboard_acva_Algeria_light
- arabic_leaderboard_acva_Ancient_Egypt_light
- arabic_leaderboard_acva_Arab_Empire_light
- arabic_leaderboard_acva_Arabic_Architecture_light
- arabic_leaderboard_acva_Arabic_Art_light
- arabic_leaderboard_acva_Arabic_Astronomy_light
- arabic_leaderboard_acva_Arabic_Calligraphy_light
- arabic_leaderboard_acva_Arabic_Ceremony_light
- arabic_leaderboard_acva_Arabic_Clothing_light
- arabic_leaderboard_acva_Arabic_Culture_light
- arabic_leaderboard_acva_Arabic_Food_light
- arabic_leaderboard_acva_Arabic_Funeral_light
- arabic_leaderboard_acva_Arabic_Geography_light
- arabic_leaderboard_acva_Arabic_History_light
- arabic_leaderboard_acva_Arabic_Language_Origin_light
- arabic_leaderboard_acva_Arabic_Literature_light
- arabic_leaderboard_acva_Arabic_Math_light
- arabic_leaderboard_acva_Arabic_Medicine_light
- arabic_leaderboard_acva_Arabic_Music_light
- arabic_leaderboard_acva_Arabic_Ornament_light
- arabic_leaderboard_acva_Arabic_Philosophy_light
- arabic_leaderboard_acva_Arabic_Physics_and_Chemistry_light
- arabic_leaderboard_acva_Arabic_Wedding_light
- arabic_leaderboard_acva_Bahrain_light
- arabic_leaderboard_acva_Comoros_light
- arabic_leaderboard_acva_Egypt_modern_light
- arabic_leaderboard_acva_InfluenceFromAncientEgypt_light
- arabic_leaderboard_acva_InfluenceFromByzantium_light
- arabic_leaderboard_acva_InfluenceFromChina_light
- arabic_leaderboard_acva_InfluenceFromGreece_light
- arabic_leaderboard_acva_InfluenceFromIslam_light
- arabic_leaderboard_acva_InfluenceFromPersia_light
- arabic_leaderboard_acva_InfluenceFromRome_light
- arabic_leaderboard_acva_Iraq_light
- arabic_leaderboard_acva_Islam_Education_light
- arabic_leaderboard_acva_Islam_branches_and_schools_light
- arabic_leaderboard_acva_Islamic_law_system_light
- arabic_leaderboard_acva_Jordan_light
- arabic_leaderboard_acva_Kuwait_light
- arabic_leaderboard_acva_Lebanon_light
- arabic_leaderboard_acva_Libya_light
- arabic_leaderboard_acva_Mauritania_light
- arabic_leaderboard_acva_Mesopotamia_civilization_light
- arabic_leaderboard_acva_Morocco_light
- arabic_leaderboard_acva_Oman_light
- arabic_leaderboard_acva_Palestine_light
- arabic_leaderboard_acva_Qatar_light
- arabic_leaderboard_acva_Saudi_Arabia_light
- arabic_leaderboard_acva_Somalia_light
- arabic_leaderboard_acva_Sudan_light
- arabic_leaderboard_acva_Syria_light
- arabic_leaderboard_acva_Tunisia_light
- arabic_leaderboard_acva_United_Arab_Emirates_light
- arabic_leaderboard_acva_Yemen_light
- arabic_leaderboard_acva_communication_light
- arabic_leaderboard_acva_computer_and_phone_light
- arabic_leaderboard_acva_daily_life_light
- arabic_leaderboard_acva_entertainment_light
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
import datasets
import numpy as np
def process_docs(dataset: "datasets.Dataset") -> "datasets.Dataset":
    """Attach the prompt, answer choices and gold index to every document.

    Each document must provide a ``question`` and an ``answer`` field, where
    ``answer`` is one of the two Arabic labels "صح" (true) or "خطأ" (false).
    The per-document function returns three fields:

    * ``query``   -- the question wrapped in the Arabic question/answer prompt,
    * ``choices`` -- the two answer labels, always in the same order,
    * ``gold``    -- the index of ``answer`` within ``choices``.

    Args:
        dataset: Object exposing ``map(fn)``; annotated as ``datasets.Dataset``.

    Returns:
        Whatever ``dataset.map`` returns for the per-document function.

    Raises:
        KeyError: If a (dict) document lacks ``question`` or ``answer``.
        ValueError: If ``answer`` is not one of the two labels.
    """
    # Single source of truth for the labels: the list handed to the model and
    # the list used to compute the gold index can no longer drift apart.
    choices = ("صح", "خطأ")

    def _process_doc(doc):
        question = doc["question"]
        answer = doc["answer"]
        return {
            "query": f"السؤال: {question}\nالإجابة:",
            # Fresh list per document so rows never share a mutable object.
            "choices": list(choices),
            "gold": choices.index(answer),
        }

    return dataset.map(_process_doc)
# Top-level group config for the "light" Arabic leaderboard.
# Each entry under task: names a member group or task; of these, only
# arabic_leaderboard_acva_light is defined in the portion of the file visible here.
# Group-level acc and acc_norm use mean aggregation with weight_by_size: true.
group: arabic_leaderboard_light
task:
- arabic_leaderboard_acva_light
- arabic_leaderboard_alghafa_light
- arabic_leaderboard_arabic_exams_light
- arabic_leaderboard_arabic_mt_arc_challenge_light
- arabic_leaderboard_arabic_mt_arc_easy_light
- arabic_leaderboard_arabic_mt_boolq_light
- arabic_leaderboard_arabic_mt_hellaswag_light
- arabic_leaderboard_arabic_mt_mmlu_light
- arabic_leaderboard_arabic_mt_copa_light
- arabic_leaderboard_arabic_mt_openbook_qa_light
- arabic_leaderboard_arabic_mt_piqa_light
- arabic_leaderboard_arabic_mt_race_light
- arabic_leaderboard_arabic_mt_sciq_light
- arabic_leaderboard_arabic_mt_toxigen_light
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment