Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
5b64fb58
Commit
5b64fb58
authored
Jun 10, 2024
by
lintangsutawika
Browse files
update aggregate_metric arg
parent
9fa3b3f4
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
128 additions
and
82 deletions
+128
-82
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
+21
-14
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
+14
-9
lm_eval/tasks/mmlu/default/_mmlu.yaml
lm_eval/tasks/mmlu/default/_mmlu.yaml
+3
-2
lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml
lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml
+5
-3
lm_eval/tasks/mmlu/default/_mmlu_other.yaml
lm_eval/tasks/mmlu/default/_mmlu_other.yaml
+5
-3
lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml
lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml
+5
-3
lm_eval/tasks/mmlu/default/_mmlu_stem.yaml
lm_eval/tasks/mmlu/default/_mmlu_stem.yaml
+5
-3
lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
+14
-9
lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml
lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml
+14
-9
lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml
lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml
+14
-9
lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml
lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml
+14
-9
lm_eval/tasks/mmlu/generative/_mmlu.yaml
lm_eval/tasks/mmlu/generative/_mmlu.yaml
+14
-9
No files found.
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
View file @
5b64fb58
...
...
@@ -4,8 +4,9 @@ task:
# ANLI R1
-
group
:
anli_r1_flan
group_alias
:
ANLI R1
group_config
:
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
task
:
-
task
:
anli_r1
task_alias
:
prompt-0
...
...
@@ -55,8 +56,9 @@ task:
# ANLI R2
-
group
:
anli_r2_flan
group_alias
:
ANLI R2
group_config
:
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
task
:
-
task
:
anli_r2
task_alias
:
prompt-0
...
...
@@ -106,8 +108,9 @@ task:
# ANLI R3
-
group
:
anli_r3_flan
group_alias
:
ANLI R3
group_config
:
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
task
:
-
task
:
anli_r3
task_alias
:
prompt-0
...
...
@@ -157,8 +160,9 @@ task:
# Arc Easy
-
group
:
arc_easy_flan
group_alias
:
Arc Easy
group_config
:
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
task
:
-
task
:
arc_easy
task_alias
:
prompt-0
...
...
@@ -198,8 +202,9 @@ task:
# Arc Challenge
-
group
:
arc_challenge_flan
group_alias
:
Arc Challenge
group_config
:
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
task
:
-
task
:
arc_challenge
task_alias
:
prompt-0
...
...
@@ -239,8 +244,9 @@ task:
# BoolQ
-
group
:
boolq_flan
group_alias
:
BoolQ
group_config
:
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
task
:
-
task
:
boolq
task_alias
:
prompt-0
...
...
@@ -295,8 +301,9 @@ task:
# RTE
-
group
:
rte_flan
group_alias
:
RTE
group_config
:
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
task
:
-
task
:
rte
task_alias
:
prompt-0
...
...
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
View file @
5b64fb58
...
...
@@ -4,24 +4,29 @@ task:
-
group
:
stem
task
:
-
mmlu_continuation_stem
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
other
task
:
-
mmlu_continuation_other
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
social sciences
task
:
-
mmlu_continuation_social_sciences
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
humanities
task
:
-
mmlu_continuation_humanities
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
True
weight_by_size
:
True
metadata
:
version
:
1
lm_eval/tasks/mmlu/default/_mmlu.yaml
View file @
5b64fb58
...
...
@@ -4,7 +4,8 @@ task:
-
mmlu_other
-
mmlu_social_sciences
-
mmlu_humanities
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
1
lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml
View file @
5b64fb58
...
...
@@ -2,6 +2,8 @@ group: mmlu_humanities
group_alias
:
humanities
task
:
-
mmlu_humanities_tasks
aggregate_metric
:
True
weight_by_size
:
True
version
:
1
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
1
lm_eval/tasks/mmlu/default/_mmlu_other.yaml
View file @
5b64fb58
...
...
@@ -2,6 +2,8 @@ group: mmlu_other
group_alias
:
other
task
:
-
mmlu_other_tasks
aggregate_metric
:
True
weight_by_size
:
True
version
:
1
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
1
\ No newline at end of file
lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml
View file @
5b64fb58
...
...
@@ -2,6 +2,8 @@ group: mmlu_social_sciences
group_alias
:
social sciences
task
:
-
mmlu_social_sciences_tasks
aggregate_metric
:
True
weight_by_size
:
True
version
:
1
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
1
\ No newline at end of file
lm_eval/tasks/mmlu/default/_mmlu_stem.yaml
View file @
5b64fb58
...
...
@@ -2,6 +2,8 @@ group: mmlu_stem
group_alias
:
stem
task
:
-
mmlu_stem_tasks
aggregate_metric
:
True
weight_by_size
:
True
version
:
1
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
1
\ No newline at end of file
lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
View file @
5b64fb58
...
...
@@ -4,24 +4,29 @@ task:
-
group
:
stem
task
:
-
mmlu_flan_cot_fewshot_stem
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
other
task
:
-
mmlu_flan_cot_fewshot_other
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
social sciences
task
:
-
mmlu_flan_cot_fewshot_social_sciences
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
humanities
task
:
-
mmlu_flan_cot_fewshot_humanities
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
True
weight_by_size
:
True
metadata
:
version
:
1
lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml
View file @
5b64fb58
...
...
@@ -4,24 +4,29 @@ task:
-
group
:
stem
task
:
-
mmlu_flan_cot_zeroshot_stem
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
other
task
:
-
mmlu_flan_cot_zeroshot_other
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
social sciences
task
:
-
mmlu_flan_cot_zeroshot_social_sciences
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
humanities
task
:
-
mmlu_flan_cot_zeroshot_humanities
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
True
weight_by_size
:
True
metadata
:
version
:
1
lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml
View file @
5b64fb58
...
...
@@ -4,24 +4,29 @@ task:
-
group
:
stem
task
:
-
mmlu_flan_n_shot_generative_stem
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
other
task
:
-
mmlu_flan_n_shot_generative_other
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
social sciences
task
:
-
mmlu_flan_n_shot_generative_social_sciences
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
humanities
task
:
-
mmlu_flan_n_shot_generative_humanities
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
True
weight_by_size
:
True
metadata
:
version
:
1
lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml
View file @
5b64fb58
...
...
@@ -4,24 +4,29 @@ task:
-
group
:
stem
task
:
-
mmlu_flan_n_shot_loglikelihood_stem
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
other
task
:
-
mmlu_flan_n_shot_loglikelihood_other
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
social sciences
task
:
-
mmlu_flan_n_shot_loglikelihood_social_sciences
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
humanities
task
:
-
mmlu_flan_n_shot_loglikelihood_humanities
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
True
weight_by_size
:
True
metadata
:
version
:
1
lm_eval/tasks/mmlu/generative/_mmlu.yaml
View file @
5b64fb58
...
...
@@ -4,24 +4,29 @@ task:
-
group
:
stem
task
:
-
mmlu_stem_generative
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
other
task
:
-
mmlu_other_generative
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
social sciences
task
:
-
mmlu_social_sciences_generative
aggregate_metric
:
True
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
humanities
task
:
-
mmlu_humanities_generative
aggregate_metric
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric
:
True
weight_by_size
:
True
metadata
:
version
:
1
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment