Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
44a602ab
"cacheflow/vscode:/vscode.git/clone" did not exist on "7addca5935c83806429d7ec557999a505e6f6a35"
Commit
44a602ab
authored
Jun 25, 2024
by
haileyschoelkopf
Browse files
add many explicit group configs
parent
c9801daf
Changes
69
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
48 additions
and
34 deletions
+48
-34
lm_eval/tasks/hendrycks_math/hendrycks_math.yaml
lm_eval/tasks/hendrycks_math/hendrycks_math.yaml
+6
-0
lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml
lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml
+1
-1
lm_eval/tasks/kormedmcqa/_kormedmcqa.yaml
lm_eval/tasks/kormedmcqa/_kormedmcqa.yaml
+11
-0
lm_eval/tasks/kormedmcqa/kormedmcqa_doctor.yaml
lm_eval/tasks/kormedmcqa/kormedmcqa_doctor.yaml
+0
-1
lm_eval/tasks/kormedmcqa/kormedmcqa_nurse.yaml
lm_eval/tasks/kormedmcqa/kormedmcqa_nurse.yaml
+0
-1
lm_eval/tasks/kormedmcqa/kormedmcqa_pharm.yaml
lm_eval/tasks/kormedmcqa/kormedmcqa_pharm.yaml
+0
-1
lm_eval/tasks/lambada/lambada_openai.yaml
lm_eval/tasks/lambada/lambada_openai.yaml
+1
-1
lm_eval/tasks/lambada/lambada_standard.yaml
lm_eval/tasks/lambada/lambada_standard.yaml
+1
-1
lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml
lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml
+1
-1
lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml
lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml
+1
-1
lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml
lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml
+1
-1
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
+5
-5
lm_eval/tasks/mmlu/default/_mmlu.yaml
lm_eval/tasks/mmlu/default/_mmlu.yaml
+1
-1
lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml
lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml
+1
-1
lm_eval/tasks/mmlu/default/_mmlu_other.yaml
lm_eval/tasks/mmlu/default/_mmlu_other.yaml
+1
-1
lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml
lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml
+1
-1
lm_eval/tasks/mmlu/default/_mmlu_stem.yaml
lm_eval/tasks/mmlu/default/_mmlu_stem.yaml
+1
-1
lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
+5
-5
lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml
lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml
+5
-5
lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml
lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml
+5
-5
No files found.
lm_eval/tasks/hendrycks_math/hendrycks_math.yaml
View file @
44a602ab
...
@@ -7,3 +7,9 @@ task:
...
@@ -7,3 +7,9 @@ task:
-
hendrycks_math_num_theory
-
hendrycks_math_num_theory
-
hendrycks_math_prealgebra
-
hendrycks_math_prealgebra
-
hendrycks_math_precalc
-
hendrycks_math_precalc
aggregate_metric_list
:
-
metric
:
exact_match
aggregation
:
mean
weight_by_size
:
true
metadata
:
version
:
1.0
lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml
View file @
44a602ab
group
:
tag
:
-
math_word_problems
-
math_word_problems
task
:
hendrycks_math_algebra
task
:
hendrycks_math_algebra
dataset_path
:
EleutherAI/hendrycks_math
dataset_path
:
EleutherAI/hendrycks_math
...
...
lm_eval/tasks/kormedmcqa/_kormedmcqa.yaml
0 → 100644
View file @
44a602ab
group
:
kormedmcqa
task
:
-
kormedmcqa_doctor
-
kormedmcqa_nurse
-
kormedmcqa_pharm
aggregate_metric_list
:
-
metric
:
exact_match
aggregation
:
mean
weight_by_size
:
true
metadata
:
version
:
0.0
lm_eval/tasks/kormedmcqa/kormedmcqa_doctor.yaml
View file @
44a602ab
group
:
kormedmcqa
task
:
kormedmcqa_doctor
task
:
kormedmcqa_doctor
dataset_path
:
sean0042/KorMedMCQA
dataset_path
:
sean0042/KorMedMCQA
dataset_name
:
doctor
dataset_name
:
doctor
...
...
lm_eval/tasks/kormedmcqa/kormedmcqa_nurse.yaml
View file @
44a602ab
group
:
kormedmcqa
task
:
kormedmcqa_nurse
task
:
kormedmcqa_nurse
dataset_path
:
sean0042/KorMedMCQA
dataset_path
:
sean0042/KorMedMCQA
dataset_name
:
nurse
dataset_name
:
nurse
...
...
lm_eval/tasks/kormedmcqa/kormedmcqa_pharm.yaml
View file @
44a602ab
group
:
kormedmcqa
task
:
kormedmcqa_pharm
task
:
kormedmcqa_pharm
dataset_path
:
sean0042/KorMedMCQA
dataset_path
:
sean0042/KorMedMCQA
dataset_name
:
pharm
dataset_name
:
pharm
...
...
lm_eval/tasks/lambada/lambada_openai.yaml
View file @
44a602ab
group
:
tag
:
-
lambada
-
lambada
task
:
lambada_openai
task
:
lambada_openai
dataset_path
:
EleutherAI/lambada_openai
dataset_path
:
EleutherAI/lambada_openai
...
...
lm_eval/tasks/lambada/lambada_standard.yaml
View file @
44a602ab
group
:
tag
:
-
lambada
-
lambada
task
:
lambada_standard
task
:
lambada_standard
dataset_path
:
lambada
dataset_path
:
lambada
...
...
lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml
View file @
44a602ab
group
:
tag
:
-
lambada_cloze
-
lambada_cloze
task
:
lambada_openai_cloze_yaml
task
:
lambada_openai_cloze_yaml
dataset_path
:
EleutherAI/lambada_openai
dataset_path
:
EleutherAI/lambada_openai
...
...
lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml
View file @
44a602ab
group
:
tag
:
-
lambada_cloze
-
lambada_cloze
task
:
lambada_standard_cloze_yaml
task
:
lambada_standard_cloze_yaml
dataset_path
:
lambada
dataset_path
:
lambada
...
...
lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml
View file @
44a602ab
group
:
tag
:
-
lambada_multilingual
-
lambada_multilingual
task
:
lambada_openai_mt_en
task
:
lambada_openai_mt_en
dataset_path
:
EleutherAI/lambada_openai
dataset_path
:
EleutherAI/lambada_openai
...
...
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
View file @
44a602ab
...
@@ -4,28 +4,28 @@ task:
...
@@ -4,28 +4,28 @@ task:
-
group
:
stem
-
group
:
stem
task
:
task
:
-
mmlu_continuation_stem
-
mmlu_continuation_stem
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
other
-
group
:
other
task
:
task
:
-
mmlu_continuation_other
-
mmlu_continuation_other
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
social sciences
-
group
:
social sciences
task
:
task
:
-
mmlu_continuation_social_sciences
-
mmlu_continuation_social_sciences
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
humanities
-
group
:
humanities
task
:
task
:
-
mmlu_continuation_humanities
-
mmlu_continuation_humanities
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
metadata
:
metadata
:
...
...
lm_eval/tasks/mmlu/default/_mmlu.yaml
View file @
44a602ab
...
@@ -4,7 +4,7 @@ task:
...
@@ -4,7 +4,7 @@ task:
-
mmlu_other
-
mmlu_other
-
mmlu_social_sciences
-
mmlu_social_sciences
-
mmlu_humanities
-
mmlu_humanities
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
metadata
:
metadata
:
...
...
lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml
View file @
44a602ab
...
@@ -2,7 +2,7 @@ group: mmlu_humanities
...
@@ -2,7 +2,7 @@ group: mmlu_humanities
group_alias
:
humanities
group_alias
:
humanities
task
:
task
:
-
mmlu_humanities_tasks
-
mmlu_humanities_tasks
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
metadata
:
metadata
:
...
...
lm_eval/tasks/mmlu/default/_mmlu_other.yaml
View file @
44a602ab
...
@@ -2,7 +2,7 @@ group: mmlu_other
...
@@ -2,7 +2,7 @@ group: mmlu_other
group_alias
:
other
group_alias
:
other
task
:
task
:
-
mmlu_other_tasks
-
mmlu_other_tasks
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
metadata
:
metadata
:
...
...
lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml
View file @
44a602ab
...
@@ -2,7 +2,7 @@ group: mmlu_social_sciences
...
@@ -2,7 +2,7 @@ group: mmlu_social_sciences
group_alias
:
social sciences
group_alias
:
social sciences
task
:
task
:
-
mmlu_social_sciences_tasks
-
mmlu_social_sciences_tasks
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
metadata
:
metadata
:
...
...
lm_eval/tasks/mmlu/default/_mmlu_stem.yaml
View file @
44a602ab
...
@@ -2,7 +2,7 @@ group: mmlu_stem
...
@@ -2,7 +2,7 @@ group: mmlu_stem
group_alias
:
stem
group_alias
:
stem
task
:
task
:
-
mmlu_stem_tasks
-
mmlu_stem_tasks
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
metadata
:
metadata
:
...
...
lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
View file @
44a602ab
...
@@ -4,28 +4,28 @@ task:
...
@@ -4,28 +4,28 @@ task:
-
group
:
stem
-
group
:
stem
task
:
task
:
-
mmlu_flan_cot_fewshot_stem
-
mmlu_flan_cot_fewshot_stem
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
other
-
group
:
other
task
:
task
:
-
mmlu_flan_cot_fewshot_other
-
mmlu_flan_cot_fewshot_other
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
social sciences
-
group
:
social sciences
task
:
task
:
-
mmlu_flan_cot_fewshot_social_sciences
-
mmlu_flan_cot_fewshot_social_sciences
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
humanities
-
group
:
humanities
task
:
task
:
-
mmlu_flan_cot_fewshot_humanities
-
mmlu_flan_cot_fewshot_humanities
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
metadata
:
metadata
:
...
...
lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml
View file @
44a602ab
...
@@ -4,28 +4,28 @@ task:
...
@@ -4,28 +4,28 @@ task:
-
group
:
stem
-
group
:
stem
task
:
task
:
-
mmlu_flan_cot_zeroshot_stem
-
mmlu_flan_cot_zeroshot_stem
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
other
-
group
:
other
task
:
task
:
-
mmlu_flan_cot_zeroshot_other
-
mmlu_flan_cot_zeroshot_other
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
social sciences
-
group
:
social sciences
task
:
task
:
-
mmlu_flan_cot_zeroshot_social_sciences
-
mmlu_flan_cot_zeroshot_social_sciences
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
humanities
-
group
:
humanities
task
:
task
:
-
mmlu_flan_cot_zeroshot_humanities
-
mmlu_flan_cot_zeroshot_humanities
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
metadata
:
metadata
:
...
...
lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml
View file @
44a602ab
...
@@ -4,28 +4,28 @@ task:
...
@@ -4,28 +4,28 @@ task:
-
group
:
stem
-
group
:
stem
task
:
task
:
-
mmlu_flan_n_shot_generative_stem
-
mmlu_flan_n_shot_generative_stem
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
other
-
group
:
other
task
:
task
:
-
mmlu_flan_n_shot_generative_other
-
mmlu_flan_n_shot_generative_other
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
social sciences
-
group
:
social sciences
task
:
task
:
-
mmlu_flan_n_shot_generative_social_sciences
-
mmlu_flan_n_shot_generative_social_sciences
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
-
group
:
humanities
-
group
:
humanities
task
:
task
:
-
mmlu_flan_n_shot_generative_humanities
-
mmlu_flan_n_shot_generative_humanities
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
aggregate_metric
:
aggregate_metric
_list
:
-
metric
:
acc
-
metric
:
acc
weight_by_size
:
True
weight_by_size
:
True
metadata
:
metadata
:
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment