Commit 9fc24ab4 authored by haileyschoelkopf

make explicit group configs for leaderboard and other newer tasks

parent b03c7636
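The diff follows one pattern throughout: task-level `group:` keys become plain `tag:` labels, and benchmarks that need an aggregate score get a standalone group config that lists its subtasks explicitly. A composite sketch of the two resulting shapes, with fields condensed for illustration (neither block is a literal file from this commit):

# Task config after the change: membership is a tag, not a group.
tag: basque-glue
task: bec2016eu
dataset_path: orai-nlp/basqueGLUE
dataset_name: bec

# Explicit group config: the group names its subtasks directly.
group: leaderboard_bbh
task:
- leaderboard_bbh_boolean_expressions
- leaderboard_bbh_causal_judgement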
-group:
+tag:
- arc_challenge_mt
task: arc_challenge_mt_fi
dataset_path: LumiOpen/arc_challenge_mt
......
-group: basque-glue
+tag: basque-glue
task: bec2016eu
dataset_path: orai-nlp/basqueGLUE
dataset_name: bec
......
-group: basque-glue
+tag: basque-glue
task: bhtc_v2
dataset_path: orai-nlp/basqueGLUE
dataset_name: bhtc
......
-group: basque-glue
+tag: basque-glue
task: epec_koref_bin
dataset_path: orai-nlp/basqueGLUE
dataset_name: coref
......
-group: basque-glue
+tag: basque-glue
task: qnlieu
dataset_path: orai-nlp/basqueGLUE
dataset_name: qnli
......
-group: basque-glue
+tag: basque-glue
task: vaxx_stance
dataset_path: orai-nlp/basqueGLUE
dataset_name: vaxx
......
-group: basque-glue
+tag: basque-glue
task: wiceu
dataset_path: orai-nlp/basqueGLUE
dataset_name: wic
......
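All six basqueGLUE files above receive the identical one-line substitution. A condensed sketch of the resulting shape, using wiceu as the example; the comment about invocation reflects the harness's tag semantics as I understand them, not text from this commit:

# wiceu.yaml after the change (condensed): tag: replaces group:, so the
# set can still be selected by the name basque-glue (e.g. via --tasks),
# but no aggregate basque-glue row is produced in the results table.
tag: basque-glue
task: wiceu
dataset_path: orai-nlp/basqueGLUE
dataset_name: wic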
......
@@ -5,7 +5,7 @@ task:
   - bbh_cot_fewshot_date_understanding
   - bbh_cot_fewshot_disambiguation_qa
   - bbh_cot_fewshot_dyck_languages
-  - bbh_cot_fewshot_formal_languages
+  - bbh_cot_fewshot_formal_fallacies
   - bbh_cot_fewshot_geometric_shapes
   - bbh_cot_fewshot_hyperbaton
   - bbh_cot_fewshot_logical_deduction_five_objects
......
-group: bertaqa
+tag: bertaqa
dataset_path: HiTZ/BertaQA
dataset_name: null
validation_split: null
......
-group:
+tag:
- inverse_scaling_mc
output_type: multiple_choice
test_split: train
......
# | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
# |-------------------------------------------|-------|------|-----:|--------|-----:|---|-----:|
# | - inverse_scaling_hindsight_neglect_10shot| 0|none | 0|acc |0.4476|± |0.0281|
# | | |none | 0|acc_norm|0.4476|± |0.0281|
# |inverse_scaling_mc |N/A |none | 0|acc_norm|0.6273|± |0.0096|
# | | |none | 0|acc |0.6210|± |0.0095|
# | - inverse_scaling_neqa | 0|none | 0|acc |0.5300|± |0.0289|
# | | |none | 0|acc_norm|0.5300|± |0.0289|
# | - inverse_scaling_quote_repetition | 0|none | 0|acc |0.9367|± |0.0141|
# | | |none | 0|acc_norm|0.9367|± |0.0141|
# | - inverse_scaling_redefine_math | 0|none | 0|acc |0.7178|± |0.0150|
# | | |none | 0|acc_norm|0.7178|± |0.0150|
# | - inverse_scaling_winobias_antistereotype | 0|none | 0|acc |0.3786|± |0.0239|
# | | |none | 0|acc_norm|0.4126|± |0.0243|
# | Groups |Version|Filter|n-shot| Metric |Value | |Stderr|
# |------------------|-------|------|-----:|--------|-----:|---|-----:|
# |inverse_scaling_mc|N/A |none | 0|acc_norm|0.6273|± |0.0096|
# | | |none | 0|acc |0.6210|± |0.0095|
# hf (pretrained=facebook/opt-2.7b,add_bos_token=True,dtype=float32), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (32)
# | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
# |-------------------------------------------|-------|------|-----:|--------|-----:|---|-----:|
# | - inverse_scaling_hindsight_neglect_10shot| 0|none | 0|acc |0.4476|± |0.0281|
# | | |none | 0|acc_norm|0.4476|± |0.0281|
# |inverse_scaling_mc |N/A |none | 0|acc_norm|0.6291|± |0.0095|
# | | |none | 0|acc |0.6219|± |0.0095|
# | - inverse_scaling_neqa | 0|none | 0|acc |0.5267|± |0.0289|
# | | |none | 0|acc_norm|0.5267|± |0.0289|
# | - inverse_scaling_quote_repetition | 0|none | 0|acc |0.9433|± |0.0134|
# | | |none | 0|acc_norm|0.9433|± |0.0134|
# | - inverse_scaling_redefine_math | 0|none | 0|acc |0.7200|± |0.0150|
# | | |none | 0|acc_norm|0.7200|± |0.0150|
# | - inverse_scaling_winobias_antistereotype | 0|none | 0|acc |0.3762|± |0.0239|
# | | |none | 0|acc_norm|0.4150|± |0.0243|
# | Groups |Version|Filter|n-shot| Metric |Value | |Stderr|
# |------------------|-------|------|-----:|--------|-----:|---|-----:|
# |inverse_scaling_mc|N/A |none | 0|acc_norm|0.6291|± |0.0095|
# | | |none | 0|acc |0.6219|± |0.0095|
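The two commented runs above agree within noise on every subtask, which is the check that the explicit regrouping reproduces the old implicit grouping. Given the subtask rows in those tables, the explicit group config for inverse_scaling_mc would look roughly as follows (a sketch inferred from the tables; the file name and field order are assumptions, not quotes from the commit):

# _inverse_scaling_mc.yaml (hypothetical name): explicit group config
# whose task list is inferred from the subtask rows above.
group: inverse_scaling_mc
task:
- inverse_scaling_hindsight_neglect_10shot
- inverse_scaling_neqa
- inverse_scaling_quote_repetition
- inverse_scaling_redefine_math
- inverse_scaling_winobias_antistereotype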
......
group: leaderboard_bbh
dataset_path: SaylorTwift/bbh
output_type: multiple_choice
test_split: test
......
group: leaderboard_bbh
task:
- leaderboard_bbh_boolean_expressions
- leaderboard_bbh_causal_judgement
- leaderboard_bbh_date_understanding
- leaderboard_bbh_disambiguation_qa
- leaderboard_bbh_formal_fallacies
- leaderboard_bbh_geometric_shapes
- leaderboard_bbh_hyperbaton
- leaderboard_bbh_logical_deduction_five_objects
- leaderboard_bbh_logical_deduction_seven_objects
- leaderboard_bbh_logical_deduction_three_objects
- leaderboard_bbh_movie_recommendation
- leaderboard_bbh_navigate
- leaderboard_bbh_object_counting
- leaderboard_bbh_penguins_in_a_table
- leaderboard_bbh_reasoning_about_colored_objects
- leaderboard_bbh_ruin_names
- leaderboard_bbh_salient_translation_error_detection
- leaderboard_bbh_snarks
- leaderboard_bbh_sports_understanding
- leaderboard_bbh_temporal_sequences
- leaderboard_bbh_tracking_shuffled_objects_five_objects
- leaderboard_bbh_tracking_shuffled_objects_seven_objects
- leaderboard_bbh_tracking_shuffled_objects_three_objects
- leaderboard_bbh_web_of_lies
......
group: leaderboard_gpqa
task:
- leaderboard_gpqa_diamond
- leaderboard_gpqa_extended
- leaderboard_gpqa_main
......
dataset_path: Idavidrein/gpqa
group: leaderboard_gpqa
output_type: multiple_choice
process_docs: !function utils.process_docs
training_split: train
......
group: leaderboard_instruction_following
task:
- leaderboard_ifeval
......
task: leaderboard_ifeval
group: leaderboard_instruction_following
dataset_path: wis-k/instruction-following-eval
dataset_name: null
output_type: generate_until
......
group: leaderboard_math_hard
task:
- leaderboard_math_algebra_hard
- leaderboard_math_counting_and_prob_hard
- leaderboard_math_geometry_hard
- leaderboard_math_intermediate_algebra_hard
- leaderboard_math_num_theory_hard
- leaderboard_math_prealgebra_hard
- leaderboard_math_precalculus_hard
......
group:
- leaderboard_math_hard
dataset_path: lighteval/MATH-Hard
process_docs: !function utils.process_docs
output_type: generate_until
......
group: leaderboard_musr
task:
- leaderboard_musr_murder_mysteries
- leaderboard_musr_object_placements
- leaderboard_musr_team_allocation
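Group configs of this shape can also carry explicit aggregation settings in the harness's group schema. A hedged sketch extending the leaderboard_musr config above; the aggregate_metric_list block is an assumption about the schema in use, not part of this diff:

# leaderboard_musr group config with an explicit aggregation rule
# (aggregate_metric_list is assumed schema, not shown in this commit).
group: leaderboard_musr
task:
- leaderboard_musr_murder_mysteries
- leaderboard_musr_object_placements
- leaderboard_musr_team_allocation
aggregate_metric_list:
- metric: acc_norm
  weight_by_size: true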