Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
b2c090cc
Unverified
Commit
b2c090cc
authored
Jan 22, 2025
by
Minho Ryu
Committed by
GitHub
Jan 21, 2025
Browse files
aggregate by group (total and categories) (#2643)
parent
ed9c6fc8
Changes
204
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
70 additions
and
19 deletions
+70
-19
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_mechanical_engineering.yaml
...direct_hard/kmmlu_direct_hard_mechanical_engineering.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml
...direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml
...val/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_political_science_and_sociology.yaml
...rd/kmmlu_direct_hard_political_science_and_sociology.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml
...tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml
...ks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_railway_and_automotive_engineering.yaml
...kmmlu_direct_hard_railway_and_automotive_engineering.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml
...asks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_refrigerating_machinery.yaml
...irect_hard/kmmlu_direct_hard_refrigerating_machinery.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_social_welfare.yaml
...s/kmmlu/direct_hard/kmmlu_direct_hard_social_welfare.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml
...l/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml
+2
-1
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml
...rect_hard_telecommunications_and_wireless_technology.yaml
+2
-1
lm_eval/tasks/kmmlu/hard/_hard_kmmlu_yaml
lm_eval/tasks/kmmlu/hard/_hard_kmmlu_yaml
+0
-6
lm_eval/tasks/kmmlu/hard/_kmmlu_hard.yaml
lm_eval/tasks/kmmlu/hard/_kmmlu_hard.yaml
+11
-0
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_applied_science.yaml
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_applied_science.yaml
+8
-0
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_humss.yaml
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_humss.yaml
+8
-0
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_other.yaml
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_other.yaml
+8
-0
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_stem.yaml
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_stem.yaml
+8
-0
lm_eval/tasks/kmmlu/hard/kmmlu_hard_accounting.yaml
lm_eval/tasks/kmmlu/hard/kmmlu_hard_accounting.yaml
+1
-0
No files found.
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml
View file @
b2c090cc
 dataset_name: math
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_math
+task: kmmlu_direct_hard_math
+tag: kmmlu_direct_hard_stem_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_mechanical_engineering.yaml
View file @
b2c090cc
 dataset_name: mechanical_engineering
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_mechanical_engineering
+task: kmmlu_direct_hard_mechanical_engineering
+tag: kmmlu_direct_hard_stem_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml
View file @
b2c090cc
 dataset_name: nondestructive_testing
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_nondestructive_testing
+task: kmmlu_direct_hard_nondestructive_testing
+tag: kmmlu_direct_hard_applied_science_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml
View file @
b2c090cc
 dataset_name: patent
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_patent
+task: kmmlu_direct_hard_patent
+tag: kmmlu_direct_hard_other_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_political_science_and_sociology.yaml
View file @
b2c090cc
 dataset_name: political_science_and_sociology
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_political_science_and_sociology
+task: kmmlu_direct_hard_political_science_and_sociology
+tag: kmmlu_direct_hard_humss_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml
View file @
b2c090cc
 dataset_name: psychology
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_psychology
+task: kmmlu_direct_hard_psychology
+tag: kmmlu_direct_hard_humss_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml
View file @
b2c090cc
 dataset_name: public_safety
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_public_safety
+task: kmmlu_direct_hard_public_safety
+tag: kmmlu_direct_hard_other_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_railway_and_automotive_engineering.yaml
View file @
b2c090cc
 dataset_name: railway_and_automotive_engineering
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_railway_and_automotive_engineering
+task: kmmlu_direct_hard_railway_and_automotive_engineering
+tag: kmmlu_direct_hard_applied_science_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml
View file @
b2c090cc
 dataset_name: real_estate
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_real_estate
+task: kmmlu_direct_hard_real_estate
+tag: kmmlu_direct_hard_other_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_refrigerating_machinery.yaml
View file @
b2c090cc
 dataset_name: refrigerating_machinery
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_refrigerating_machinery
+task: kmmlu_direct_hard_refrigerating_machinery
+tag: kmmlu_direct_hard_other_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_social_welfare.yaml
View file @
b2c090cc
 dataset_name: social_welfare
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_social_welfare
+task: kmmlu_direct_hard_social_welfare
+tag: kmmlu_direct_hard_humss_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml
View file @
b2c090cc
 dataset_name: taxation
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_taxation
+task: kmmlu_direct_hard_taxation
+tag: kmmlu_direct_hard_humss_tasks
lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml
View file @
b2c090cc
 dataset_name: telecommunications_and_wireless_technology
 include: _direct_hard_kmmlu_yaml
-task: kmmlu_hard_direct_telecommunications_and_wireless_technology
+task: kmmlu_direct_hard_telecommunications_and_wireless_technology
+tag: kmmlu_direct_hard_applied_science_tasks
lm_eval/tasks/kmmlu/hard/_hard_kmmlu_yaml
View file @
b2c090cc
-tag:
-  - kmmlu
-  - kmmlu_hard
 dataset_path: HAERAE-HUB/KMMLU-HARD
 output_type: multiple_choice
 test_split: test
...
...
@@ -12,8 +9,5 @@ metric_list:
   - metric: acc
     aggregation: mean
     higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
 metadata:
   version: 2.0
lm_eval/tasks/kmmlu/hard/_kmmlu_hard.yaml
0 → 100644
View file @
b2c090cc
group: kmmlu_hard
task:
  - kmmlu_hard_stem
  - kmmlu_hard_other
  - kmmlu_hard_applied_science
  - kmmlu_hard_humss
aggregate_metric_list:
  - metric: acc
    weight_by_size: True
metadata:
  version: 2.0
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_applied_science.yaml
0 → 100644
View file @
b2c090cc
group: kmmlu_hard_applied_science
task:
  - kmmlu_hard_applied_science_tasks
aggregate_metric_list:
  - metric: acc
    weight_by_size: True
metadata:
  version: 2.0
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_humss.yaml
0 → 100644
View file @
b2c090cc
group: kmmlu_hard_humss
task:
  - kmmlu_hard_humss_tasks
aggregate_metric_list:
  - metric: acc
    weight_by_size: True
metadata:
  version: 2.0
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_other.yaml
0 → 100644
View file @
b2c090cc
group: kmmlu_hard_other
task:
  - kmmlu_hard_other_tasks
aggregate_metric_list:
  - metric: acc
    weight_by_size: True
metadata:
  version: 2.0
lm_eval/tasks/kmmlu/hard/_kmmlu_hard_stem.yaml
0 → 100644
View file @
b2c090cc
group: kmmlu_hard_stem
task:
  - kmmlu_hard_stem_tasks
aggregate_metric_list:
  - metric: acc
    weight_by_size: True
metadata:
  version: 2.0
lm_eval/tasks/kmmlu/hard/kmmlu_hard_accounting.yaml
View file @
b2c090cc
 dataset_name: accounting
 include: _hard_kmmlu_yaml
 task: kmmlu_hard_accounting
+tag: kmmlu_hard_humss_tasks
Prev
1
…
4
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment