Commit 173b2bc3 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into humaneval

# Conflicts:
#	lm_eval/api/task.py
parents 74344829 bb098f13
"dataset_name": "na_language_arabic-language-grammar"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_language_egy"
"task": "AraDiCE_ArabicMMLU_na_language_arabic-language-grammar_egy"
"task_alias": "na language arabic-language-grammar"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "na_other_driving-test"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_other_egy"
"task": "AraDiCE_ArabicMMLU_na_other_driving-test_egy"
"task_alias": "na other driving-test"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "na_other_general-knowledge"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_other_egy"
"task": "AraDiCE_ArabicMMLU_na_other_general-knowledge_egy"
"task_alias": "na other general-knowledge"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "primary_humanities_history"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_humanities_egy"
"task": "AraDiCE_ArabicMMLU_primary_humanities_history_egy"
"task_alias": "primary humanities history"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "primary_humanities_islamic-studies"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_humanities_egy"
"task": "AraDiCE_ArabicMMLU_primary_humanities_islamic-studies_egy"
"task_alias": "primary humanities islamic-studies"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "primary_language_arabic-language"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_language_egy"
"task": "AraDiCE_ArabicMMLU_primary_language_arabic-language_egy"
"task_alias": "primary language arabic-language"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "primary_other_general-knowledge"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_other_egy"
"task": "AraDiCE_ArabicMMLU_primary_other_general-knowledge_egy"
"task_alias": "primary other general-knowledge"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "primary_social-science_geography"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_social-science_egy"
"task": "AraDiCE_ArabicMMLU_primary_social-science_geography_egy"
"task_alias": "primary social-science geography"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "primary_social-science_social-science"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_social-science_egy"
"task": "AraDiCE_ArabicMMLU_primary_social-science_social-science_egy"
"task_alias": "primary social-science social-science"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "primary_stem_computer-science"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_stem_egy"
"task": "AraDiCE_ArabicMMLU_primary_stem_computer-science_egy"
"task_alias": "primary stem computer-science"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "primary_stem_math"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_stem_egy"
"task": "AraDiCE_ArabicMMLU_primary_stem_math_egy"
"task_alias": "primary stem math"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "primary_stem_natural-science"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_stem_egy"
"task": "AraDiCE_ArabicMMLU_primary_stem_natural-science_egy"
"task_alias": "primary stem natural-science"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "prof_humanities_law"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_humanities_egy"
"task": "AraDiCE_ArabicMMLU_prof_humanities_law_egy"
"task_alias": "prof humanities law"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_other_management"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_other_egy"
"task": "AraDiCE_ArabicMMLU_univ_other_management_egy"
"task_alias": "univ other management"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_social-science_accounting"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_social-science_egy"
"task": "AraDiCE_ArabicMMLU_univ_social-science_accounting_egy"
"task_alias": "univ social-science accounting"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_social-science_economics"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_social-science_egy"
"task": "AraDiCE_ArabicMMLU_univ_social-science_economics_egy"
"task_alias": "univ social-science economics"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_social-science_political-science"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_social-science_egy"
"task": "AraDiCE_ArabicMMLU_univ_social-science_political-science_egy"
"task_alias": "univ social-science political-science"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_stem_computer-science"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_stem_egy"
"task": "AraDiCE_ArabicMMLU_univ_stem_computer-science_egy"
"task_alias": "univ stem computer-science"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
dataset_path: "QCRI/AraDICE-ArabicMMLU-egy"
fewshot_config:
  sampler: default
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{prompt}}"
doc_to_choice: choices
doc_to_target: target
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: f1
    higher_is_better: true
    aggregation: !function metrics.micro_f1_score
metadata:
  version: 0.0
from sklearn.metrics import f1_score
def macro_f1_score(items):
    """Aggregate per-example results into a macro-averaged F1 score.

    Each element of ``items`` supplies its first entry as the gold label and
    its second entry as the predicted label. The two label sequences are
    passed to ``f1_score`` with ``average="macro"`` and the result is
    returned unchanged.
    """
    # Transpose the per-example records into per-field columns.
    columns = list(zip(*items))
    references, predictions = columns[0], columns[1]
    return f1_score(references, predictions, average="macro")
def micro_f1_score(items):
    """Aggregate per-example results into a micro-averaged F1 score.

    Each element of ``items`` supplies its first entry as the gold label and
    its second entry as the predicted label. The two label sequences are
    passed to ``f1_score`` with ``average="micro"`` and the result is
    returned unchanged.
    """
    # Transpose the per-example records into per-field columns.
    columns = list(zip(*items))
    references, predictions = columns[0], columns[1]
    return f1_score(references, predictions, average="micro")
def weighted_f1_score(items):
    """Aggregate per-example results into a weighted-average F1 score.

    Each element of ``items`` supplies its first entry as the gold label and
    its second entry as the predicted label. The two label sequences are
    passed to ``f1_score`` with ``average="weighted"`` and the result is
    returned unchanged.
    """
    # Transpose the per-example records into per-field columns.
    columns = list(zip(*items))
    references, predictions = columns[0], columns[1]
    return f1_score(references, predictions, average="weighted")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment