Commit ba73d131 authored by lintangsutawika
Browse files

Merge branch 'group-agg-rework' of...

Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into group-agg-rework
parents 6e2dbe76 269b66e9
"dataset_name": "Primary Natural Science" "dataset_name": "Primary Natural Science"
"group": "arabicmmlu_stem" "tag": "arabicmmlu_stem_tasks"
"group_alias": "stem"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "arabicmmlu_primary_natural_science" "task": "arabicmmlu_primary_natural_science"
"task_alias": "Primary Natural Science" "task_alias": "Primary Natural Science"
"dataset_name": "Primary Social Science" "dataset_name": "Primary Social Science"
"group": "arabicmmlu_social_science" "tag": "arabicmmlu_social_science_tasks"
"group_alias": "social science"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "arabicmmlu_primary_social_science" "task": "arabicmmlu_primary_social_science"
"task_alias": "Primary Social Science" "task_alias": "Primary Social Science"
"dataset_name": "Prof Law" "dataset_name": "Prof Law"
"group": "arabicmmlu_humanities" "tag": "arabicmmlu_humanities_tasks"
"group_alias": "humanities"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "arabicmmlu_prof_law" "task": "arabicmmlu_prof_law"
"task_alias": "Prof Law" "task_alias": "Prof Law"
"dataset_name": "Univ Accounting" "dataset_name": "Univ Accounting"
"group": "arabicmmlu_social_science" "tag": "arabicmmlu_social_science_tasks"
"group_alias": "social science"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "arabicmmlu_univ_accounting" "task": "arabicmmlu_univ_accounting"
"task_alias": "Univ Accounting" "task_alias": "Univ Accounting"
"dataset_name": "Univ Computer Science" "dataset_name": "Univ Computer Science"
"group": "arabicmmlu_stem" "tag": "arabicmmlu_stem_tasks"
"group_alias": "stem"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "arabicmmlu_univ_computer_science" "task": "arabicmmlu_univ_computer_science"
"task_alias": "Univ Computer Science" "task_alias": "Univ Computer Science"
"dataset_name": "Univ Economics" "dataset_name": "Univ Economics"
"group": "arabicmmlu_social_science" "tag": "arabicmmlu_social_science_tasks"
"group_alias": "social science"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "arabicmmlu_univ_economics" "task": "arabicmmlu_univ_economics"
"task_alias": "Univ Economics" "task_alias": "Univ Economics"
"dataset_name": "Univ Management" "dataset_name": "Univ Management"
"group": "arabicmmlu_other" "tag": "arabicmmlu_other_tasks"
"group_alias": "other"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "arabicmmlu_univ_management" "task": "arabicmmlu_univ_management"
"task_alias": "Univ Management" "task_alias": "Univ Management"
"dataset_name": "Univ Political Science" "dataset_name": "Univ Political Science"
"group": "arabicmmlu_social_science" "tag": "arabicmmlu_social_science_tasks"
"group_alias": "social science"
"include": "_default_template_yaml" "include": "_default_template_yaml"
"task": "arabicmmlu_univ_political_science" "task": "arabicmmlu_univ_political_science"
"task_alias": "Univ Political Science" "task_alias": "Univ Political Science"
group: tag:
- arc_challenge_mt - arc_challenge_mt
task: arc_challenge_mt_fi task: arc_challenge_mt_fi
dataset_path: LumiOpen/arc_challenge_mt dataset_path: LumiOpen/arc_challenge_mt
......
group: basque-glue tag: basque-glue
task: bec2016eu task: bec2016eu
dataset_path: orai-nlp/basqueGLUE dataset_path: orai-nlp/basqueGLUE
dataset_name: bec dataset_name: bec
......
group: basque-glue tag: basque-glue
task: bhtc_v2 task: bhtc_v2
dataset_path: orai-nlp/basqueGLUE dataset_path: orai-nlp/basqueGLUE
dataset_name: bhtc dataset_name: bhtc
......
group: basque-glue tag: basque-glue
task: epec_koref_bin task: epec_koref_bin
dataset_path: orai-nlp/basqueGLUE dataset_path: orai-nlp/basqueGLUE
dataset_name: coref dataset_name: coref
......
group: basque-glue tag: basque-glue
task: qnlieu task: qnlieu
dataset_path: orai-nlp/basqueGLUE dataset_path: orai-nlp/basqueGLUE
dataset_name: qnli dataset_name: qnli
......
group: basque-glue tag: basque-glue
task: vaxx_stance task: vaxx_stance
dataset_path: orai-nlp/basqueGLUE dataset_path: orai-nlp/basqueGLUE
dataset_name: vaxx dataset_name: vaxx
......
group: basque-glue tag: basque-glue
task: wiceu task: wiceu
dataset_path: orai-nlp/basqueGLUE dataset_path: orai-nlp/basqueGLUE
dataset_name: wic dataset_name: wic
......
...@@ -5,7 +5,7 @@ task: ...@@ -5,7 +5,7 @@ task:
- bbh_cot_fewshot_date_understanding - bbh_cot_fewshot_date_understanding
- bbh_cot_fewshot_disambiguation_qa - bbh_cot_fewshot_disambiguation_qa
- bbh_cot_fewshot_dyck_languages - bbh_cot_fewshot_dyck_languages
- bbh_cot_fewshot_formal_languages - bbh_cot_fewshot_formal_fallacies
- bbh_cot_fewshot_geometric_shapes - bbh_cot_fewshot_geometric_shapes
- bbh_cot_fewshot_hyperbaton - bbh_cot_fewshot_hyperbaton
- bbh_cot_fewshot_logical_deduction_five_objects - bbh_cot_fewshot_logical_deduction_five_objects
......
group: bertaqa tag: bertaqa
dataset_path: HiTZ/BertaQA dataset_path: HiTZ/BertaQA
dataset_name: null dataset_name: null
validation_split: null validation_split: null
......
group: tag:
- inverse_scaling_mc - inverse_scaling_mc
output_type: multiple_choice output_type: multiple_choice
test_split: train test_split: train
......
# | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
# |-------------------------------------------|-------|------|-----:|--------|-----:|---|-----:|
# | - inverse_scaling_hindsight_neglect_10shot| 0|none | 0|acc |0.4476|± |0.0281|
# | | |none | 0|acc_norm|0.4476|± |0.0281|
# |inverse_scaling_mc |N/A |none | 0|acc_norm|0.6273|± |0.0096|
# | | |none | 0|acc |0.6210|± |0.0095|
# | - inverse_scaling_neqa | 0|none | 0|acc |0.5300|± |0.0289|
# | | |none | 0|acc_norm|0.5300|± |0.0289|
# | - inverse_scaling_quote_repetition | 0|none | 0|acc |0.9367|± |0.0141|
# | | |none | 0|acc_norm|0.9367|± |0.0141|
# | - inverse_scaling_redefine_math | 0|none | 0|acc |0.7178|± |0.0150|
# | | |none | 0|acc_norm|0.7178|± |0.0150|
# | - inverse_scaling_winobias_antistereotype | 0|none | 0|acc |0.3786|± |0.0239|
# | | |none | 0|acc_norm|0.4126|± |0.0243|
# | Groups |Version|Filter|n-shot| Metric |Value | |Stderr|
# |------------------|-------|------|-----:|--------|-----:|---|-----:|
# |inverse_scaling_mc|N/A |none | 0|acc_norm|0.6273|± |0.0096|
# | | |none | 0|acc |0.6210|± |0.0095|
# hf (pretrained=facebook/opt-2.7b,add_bos_token=True,dtype=float32), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (32)
# | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
# |-------------------------------------------|-------|------|-----:|--------|-----:|---|-----:|
# | - inverse_scaling_hindsight_neglect_10shot| 0|none | 0|acc |0.4476|± |0.0281|
# | | |none | 0|acc_norm|0.4476|± |0.0281|
# |inverse_scaling_mc |N/A |none | 0|acc_norm|0.6291|± |0.0095|
# | | |none | 0|acc |0.6219|± |0.0095|
# | - inverse_scaling_neqa | 0|none | 0|acc |0.5267|± |0.0289|
# | | |none | 0|acc_norm|0.5267|± |0.0289|
# | - inverse_scaling_quote_repetition | 0|none | 0|acc |0.9433|± |0.0134|
# | | |none | 0|acc_norm|0.9433|± |0.0134|
# | - inverse_scaling_redefine_math | 0|none | 0|acc |0.7200|± |0.0150|
# | | |none | 0|acc_norm|0.7200|± |0.0150|
# | - inverse_scaling_winobias_antistereotype | 0|none | 0|acc |0.3762|± |0.0239|
# | | |none | 0|acc_norm|0.4150|± |0.0243|
# | Groups |Version|Filter|n-shot| Metric |Value | |Stderr|
# |------------------|-------|------|-----:|--------|-----:|---|-----:|
# |inverse_scaling_mc|N/A |none | 0|acc_norm|0.6291|± |0.0095|
# | | |none | 0|acc |0.6219|± |0.0095|
group: leaderboard_bbh
dataset_path: SaylorTwift/bbh dataset_path: SaylorTwift/bbh
output_type: multiple_choice output_type: multiple_choice
test_split: test test_split: test
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment