Commit 173b2bc3 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into humaneval

# Conflicts:
#	lm_eval/api/task.py
parents 74344829 bb098f13
group: arabic_leaderboard_arabic_mt_sciq_light
task:
- arabic_mt_sciq_light
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
task: arabic_mt_sciq_light
dataset_path: arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Translated-10percent
dataset_name: sciq_ar
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
import random
import datasets
import numpy as np
def doc_to_text(doc):
instruction = (
"بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال التالي من قائمة الاقتراحات"
)
support = doc["support"]
question = doc["question"]
query = f"""{instruction}
السياق:
{support}
السؤال:
{question}
الإجابات المحتملة:
"""
return query
def process_docs(dataset: datasets.Dataset):
def _process_doc(doc):
correct_answer = doc["correct_answer"]
choices = [
doc["distractor1"],
doc["distractor2"],
doc["distractor3"],
correct_answer,
]
# Shuffle the choices
random.shuffle(choices)
answer_index = choices.index(correct_answer)
return {"query": doc_to_text(doc), "choices": choices, "gold": answer_index}
return dataset.map(_process_doc)
group: arabic_leaderboard_arabic_mt_toxigen_light
task:
- arabic_mt_toxigen_light
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
task: arabic_mt_toxigen_light
dataset_path: arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Translated-10percent
dataset_name: toxigen_ar
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
import datasets
import numpy as np
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc):
text = doc["text"]
label = 1 if ((doc["toxicity_ai"] + doc["toxicity_human"]) > 5.5) else 0
instruction = 'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".'
query = f"""{instruction}
العبارة:
'{text}'
الإجابة:
"""
out_doc = {
"query": query,
"choices": ["لا", "نعم"],
"gold": label,
}
return out_doc
return dataset.map(_process_doc)
task: arabic_leaderboard_acva_Algeria_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Algeria
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Ancient_Egypt_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Ancient_Egypt
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arab_Empire_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arab_Empire
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arabic_Architecture_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arabic_Architecture
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arabic_Art_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arabic_Art
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arabic_Astronomy_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arabic_Astronomy
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arabic_Calligraphy_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arabic_Calligraphy
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arabic_Ceremony_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arabic_Ceremony
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arabic_Clothing_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arabic_Clothing
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arabic_Culture_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arabic_Culture
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arabic_Food_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arabic_Food
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arabic_Funeral_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arabic_Funeral
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arabic_Geography_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arabic_Geography
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_acva_Arabic_History_light
dataset_path: arcee-globe/ACVA-10percent
dataset_name: Arabic_History
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment