OpenDAS / opencompass · Commit aa2dd2b5
Unverified commit aa2dd2b5, authored May 14, 2024 by Fengzhe Zhou; committed by GitHub on May 14, 2024.
[Format] Add config lints (#892)

Parent: 3dbba119
Showing 20 changed files with 359 additions and 360 deletions:
configs/summarizers/compassbench_v1_objective.py    +25 -25
configs/summarizers/contamination.py                +59 -59
configs/summarizers/example.py                      +1 -1
configs/summarizers/groups/MMLUArabic.py            +20 -20
configs/summarizers/groups/cibench.py               +1 -1
configs/summarizers/groups/cmmlu.py                 +72 -72
configs/summarizers/groups/lawbench.py              +20 -20
configs/summarizers/groups/leval.py                 +1 -1
configs/summarizers/groups/lveval.py                +38 -38
configs/summarizers/groups/mgsm.py                  +3 -3
configs/summarizers/groups/scibench.py              +3 -3
configs/summarizers/groups/teval.py                 +0 -1
configs/summarizers/groups/xiezhi.py                +1 -1
configs/summarizers/infinitebench.py                +2 -2
configs/summarizers/internlm2_keyset.py             +1 -1
configs/summarizers/lawbench.py                     +1 -1
configs/summarizers/leaderboard.py                  +6 -6
configs/summarizers/lveval.py                       +103 -103
configs/summarizers/math_agent.py                   +1 -1
configs/summarizers/math_baseline.py                +1 -1
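Nearly every hunk in this commit applies the same mechanical lint rule: double-quoted string literals in the config files become single-quoted (plus a few whitespace and end-of-file fixes). The lint configuration itself is not part of this diff; as a rough illustration only, the rewrite rule for the simple cases seen below can be sketched in Python like this (a naive regex, not the project's actual linter, which would also have to skip comments, escapes, and nested quotes):

import re

def prefer_single_quotes(line: str) -> str:
    # Rewrite "..." to '...' when the literal contains no quote or
    # backslash characters -- exactly the shape of the changes below.
    return re.sub(r'"([^"\'\\]*)"', r"'\1'", line)

print(prefer_single_quotes('        "name": "average_cn",'))
# prints:         'name': 'average_cn',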
configs/summarizers/compassbench_v1_objective.py

@@ -115,36 +115,36 @@ agent_summary_groups = [
 other_summary_groups = [
     {
-        "name": "average_cn",
-        "subsets": [
-            ["language_zh_perf_4_and_non_mcq", "naive_average"],
-            ["knowledge_cn", "perf_4"],
-            ["reasonbench_cn_circular", "perf_circular"],
-            ["math_perf_4_and_fill_in_blank_cn", "naive_average"],
-            ["code_cn", "naive_average"],
-            ["agent_cn", "naive_average"],
+        'name': 'average_cn',
+        'subsets': [
+            ['language_zh_perf_4_and_non_mcq', 'naive_average'],
+            ['knowledge_cn', 'perf_4'],
+            ['reasonbench_cn_circular', 'perf_circular'],
+            ['math_perf_4_and_fill_in_blank_cn', 'naive_average'],
+            ['code_cn', 'naive_average'],
+            ['agent_cn', 'naive_average'],
         ],
     },
     {
-        "name": "average_en",
-        "subsets": [
-            ["language_en_perf_4_and_non_mcq", "naive_average"],
-            ["compassbench_v1_knowledge-mixed-cloze_en", "score"],
-            ["reasonbench_en_circular", "perf_circular"],
-            ["math_perf_4_and_fill_in_blank_en", "naive_average"],
-            ["code_en", "naive_average"],
-            ["agent_en", "naive_average"],
+        'name': 'average_en',
+        'subsets': [
+            ['language_en_perf_4_and_non_mcq', 'naive_average'],
+            ['compassbench_v1_knowledge-mixed-cloze_en', 'score'],
+            ['reasonbench_en_circular', 'perf_circular'],
+            ['math_perf_4_and_fill_in_blank_en', 'naive_average'],
+            ['code_en', 'naive_average'],
+            ['agent_en', 'naive_average'],
         ],
     },
     {
-        "name": "average",
-        "subsets": [
-            ["language_perf_4_and_non_mcq", "naive_average"],
-            ["knowledge_perf_4_and_cloze", "naive_average"],
-            ["reasonbench", "perf_circular"],
-            ["math_perf_4_and_fill_in_blank", "naive_average"],
-            ["code", "naive_average"],
-            ["agent", "naive_average"],
+        'name': 'average',
+        'subsets': [
+            ['language_perf_4_and_non_mcq', 'naive_average'],
+            ['knowledge_perf_4_and_cloze', 'naive_average'],
+            ['reasonbench', 'perf_circular'],
+            ['math_perf_4_and_fill_in_blank', 'naive_average'],
+            ['code', 'naive_average'],
+            ['agent', 'naive_average'],
         ],
     },
 ]

@@ -223,5 +223,5 @@ summarizer = dict(
         ['plugin_eval-mus-p10_one_review_zh', 'naive_average'],
         ['plugin_eval-mus-p10_one_review', 'naive_average'],
     ],
-    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
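The summary_groups expression touched in the second hunk is an idiom that recurs in every summarizer config in this commit: read_base() pulls a number of *_summary_groups lists into module scope, and the expression scans locals() for them and flattens them into one list. A minimal, self-contained sketch of how it behaves (the two group variables below are hypothetical stand-ins for the real imports):

a_summary_groups = [{'name': 'group_a', 'subsets': ['task_1', 'task_2']}]
b_summary_groups = [{'name': 'group_b', 'subsets': ['task_3']}]

# sum(list_of_lists, []) starts from [] and concatenates with +, so every
# *_summary_groups list currently in scope ends up in one flat list.
summary_groups = sum([v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print([g['name'] for g in summary_groups])  # ['group_a', 'group_b']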
configs/summarizers/contamination.py

@@ -60,63 +60,63 @@ ceval_category_weights = {
 }

 mmlu_category_weights = {
-    "business_ethics": {"accuracy - clean": 44, "accuracy - input contaminated": 16, "accuracy - input-and-label contaminated": 38, "accuracy - not labeled": 1},
-    "security_studies": {"accuracy - clean": 188, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 47, "accuracy - not labeled": 0},
-    "high_school_us_history": {"accuracy - clean": 42, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 161},
-    "moral_disputes": {"accuracy - clean": 105, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 168, "accuracy - not labeled": 59},
-    "philosophy": {"accuracy - clean": 81, "accuracy - input contaminated": 11, "accuracy - input-and-label contaminated": 187, "accuracy - not labeled": 31},
-    "public_relations": {"accuracy - clean": 75, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 26, "accuracy - not labeled": 0},
-    "high_school_microeconomics": {"accuracy - clean": 82, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 146, "accuracy - not labeled": 0},
-    "human_sexuality": {"accuracy - clean": 108, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 15, "accuracy - not labeled": 4},
-    "professional_accounting": {"accuracy - clean": 88, "accuracy - input contaminated": 40, "accuracy - input-and-label contaminated": 152, "accuracy - not labeled": 1},
-    "high_school_government_and_politics": {"accuracy - clean": 104, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 82, "accuracy - not labeled": 0},
-    "sociology": {"accuracy - clean": 105, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 91, "accuracy - not labeled": 0},
-    "conceptual_physics": {"accuracy - clean": 79, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 147, "accuracy - not labeled": 0},
-    "human_aging": {"accuracy - clean": 208, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 13, "accuracy - not labeled": 0},
-    "high_school_psychology": {"accuracy - clean": 108, "accuracy - input contaminated": 26, "accuracy - input-and-label contaminated": 162, "accuracy - not labeled": 248},
-    "jurisprudence": {"accuracy - clean": 59, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 43, "accuracy - not labeled": 0},
-    "moral_scenarios": {"accuracy - clean": 320, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 574},
-    "college_medicine": {"accuracy - clean": 107, "accuracy - input contaminated": 16, "accuracy - input-and-label contaminated": 44, "accuracy - not labeled": 5},
-    "high_school_world_history": {"accuracy - clean": 61, "accuracy - input contaminated": 2, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 173},
-    "virology": {"accuracy - clean": 104, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 58, "accuracy - not labeled": 0},
-    "high_school_statistics": {"accuracy - clean": 96, "accuracy - input contaminated": 43, "accuracy - input-and-label contaminated": 76, "accuracy - not labeled": 0},
-    "nutrition": {"accuracy - clean": 172, "accuracy - input contaminated": 11, "accuracy - input-and-label contaminated": 98, "accuracy - not labeled": 24},
-    "abstract_algebra": {"accuracy - clean": 84, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 7, "accuracy - not labeled": 0},
-    "high_school_geography": {"accuracy - clean": 91, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 105, "accuracy - not labeled": 0},
-    "econometrics": {"accuracy - clean": 62, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 38, "accuracy - not labeled": 0},
-    "marketing": {"accuracy - clean": 115, "accuracy - input contaminated": 15, "accuracy - input-and-label contaminated": 101, "accuracy - not labeled": 2},
-    "high_school_chemistry": {"accuracy - clean": 108, "accuracy - input contaminated": 25, "accuracy - input-and-label contaminated": 69, "accuracy - not labeled": 0},
-    "prehistory": {"accuracy - clean": 154, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 107, "accuracy - not labeled": 57},
-    "college_physics": {"accuracy - clean": 25, "accuracy - input contaminated": 20, "accuracy - input-and-label contaminated": 57, "accuracy - not labeled": 0},
-    "management": {"accuracy - clean": 35, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 62, "accuracy - not labeled": 0},
-    "college_biology": {"accuracy - clean": 91, "accuracy - input contaminated": 12, "accuracy - input-and-label contaminated": 40, "accuracy - not labeled": 0},
-    "high_school_biology": {"accuracy - clean": 128, "accuracy - input contaminated": 17, "accuracy - input-and-label contaminated": 135, "accuracy - not labeled": 29},
-    "high_school_physics": {"accuracy - clean": 42, "accuracy - input contaminated": 28, "accuracy - input-and-label contaminated": 80, "accuracy - not labeled": 0},
-    "logical_fallacies": {"accuracy - clean": 133, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 24, "accuracy - not labeled": 0},
-    "medical_genetics": {"accuracy - clean": 49, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 43, "accuracy - not labeled": 1},
-    "machine_learning": {"accuracy - clean": 71, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 32, "accuracy - not labeled": 0},
-    "professional_law": {"accuracy - clean": 401, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 5, "accuracy - not labeled": 1119},
-    "professional_psychology": {"accuracy - clean": 265, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 27, "accuracy - not labeled": 310},
-    "global_facts": {"accuracy - clean": 89, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 5, "accuracy - not labeled": 0},
-    "us_foreign_policy": {"accuracy - clean": 71, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 25, "accuracy - not labeled": 0},
-    "international_law": {"accuracy - clean": 73, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 46, "accuracy - not labeled": 0},
-    "clinical_knowledge": {"accuracy - clean": 172, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 86, "accuracy - not labeled": 0},
-    "high_school_mathematics": {"accuracy - clean": 178, "accuracy - input contaminated": 59, "accuracy - input-and-label contaminated": 32, "accuracy - not labeled": 0},
-    "high_school_computer_science": {"accuracy - clean": 62, "accuracy - input contaminated": 7, "accuracy - input-and-label contaminated": 28, "accuracy - not labeled": 2},
-    "college_computer_science": {"accuracy - clean": 68, "accuracy - input contaminated": 15, "accuracy - input-and-label contaminated": 15, "accuracy - not labeled": 1},
-    "electrical_engineering": {"accuracy - clean": 75, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 61, "accuracy - not labeled": 0},
-    "college_mathematics": {"accuracy - clean": 61, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 26, "accuracy - not labeled": 0},
-    "computer_security": {"accuracy - clean": 55, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 36, "accuracy - not labeled": 0},
-    "high_school_macroeconomics": {"accuracy - clean": 102, "accuracy - input contaminated": 14, "accuracy - input-and-label contaminated": 173, "accuracy - not labeled": 100},
-    "astronomy": {"accuracy - clean": 112, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 35, "accuracy - not labeled": 0},
-    "college_chemistry": {"accuracy - clean": 46, "accuracy - input contaminated": 19, "accuracy - input-and-label contaminated": 34, "accuracy - not labeled": 0},
-    "high_school_european_history": {"accuracy - clean": 41, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 123},
-    "miscellaneous": {"accuracy - clean": 256, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 40, "accuracy - not labeled": 477},
-    "formal_logic": {"accuracy - clean": 92, "accuracy - input contaminated": 12, "accuracy - input-and-label contaminated": 21, "accuracy - not labeled": 0},
-    "elementary_mathematics": {"accuracy - clean": 155, "accuracy - input contaminated": 31, "accuracy - input-and-label contaminated": 103, "accuracy - not labeled": 88},
-    "world_religions": {"accuracy - clean": 130, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 36, "accuracy - not labeled": 0},
-    "professional_medicine": {"accuracy - clean": 191, "accuracy - input contaminated": 43, "accuracy - input-and-label contaminated": 1, "accuracy - not labeled": 36},
-    "anatomy": {"accuracy - clean": 52, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 76, "accuracy - not labeled": 0},
+    'business_ethics': {'accuracy - clean': 44, 'accuracy - input contaminated': 16, 'accuracy - input-and-label contaminated': 38, 'accuracy - not labeled': 1},
+    'security_studies': {'accuracy - clean': 188, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 47, 'accuracy - not labeled': 0},
+    'high_school_us_history': {'accuracy - clean': 42, 'accuracy - input contaminated': 0, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 161},
+    'moral_disputes': {'accuracy - clean': 105, 'accuracy - input contaminated': 13, 'accuracy - input-and-label contaminated': 168, 'accuracy - not labeled': 59},
+    'philosophy': {'accuracy - clean': 81, 'accuracy - input contaminated': 11, 'accuracy - input-and-label contaminated': 187, 'accuracy - not labeled': 31},
+    'public_relations': {'accuracy - clean': 75, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 26, 'accuracy - not labeled': 0},
+    'high_school_microeconomics': {'accuracy - clean': 82, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 146, 'accuracy - not labeled': 0},
+    'human_sexuality': {'accuracy - clean': 108, 'accuracy - input contaminated': 3, 'accuracy - input-and-label contaminated': 15, 'accuracy - not labeled': 4},
+    'professional_accounting': {'accuracy - clean': 88, 'accuracy - input contaminated': 40, 'accuracy - input-and-label contaminated': 152, 'accuracy - not labeled': 1},
+    'high_school_government_and_politics': {'accuracy - clean': 104, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 82, 'accuracy - not labeled': 0},
+    'sociology': {'accuracy - clean': 105, 'accuracy - input contaminated': 4, 'accuracy - input-and-label contaminated': 91, 'accuracy - not labeled': 0},
+    'conceptual_physics': {'accuracy - clean': 79, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 147, 'accuracy - not labeled': 0},
+    'human_aging': {'accuracy - clean': 208, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 13, 'accuracy - not labeled': 0},
+    'high_school_psychology': {'accuracy - clean': 108, 'accuracy - input contaminated': 26, 'accuracy - input-and-label contaminated': 162, 'accuracy - not labeled': 248},
+    'jurisprudence': {'accuracy - clean': 59, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 43, 'accuracy - not labeled': 0},
+    'moral_scenarios': {'accuracy - clean': 320, 'accuracy - input contaminated': 0, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 574},
+    'college_medicine': {'accuracy - clean': 107, 'accuracy - input contaminated': 16, 'accuracy - input-and-label contaminated': 44, 'accuracy - not labeled': 5},
+    'high_school_world_history': {'accuracy - clean': 61, 'accuracy - input contaminated': 2, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 173},
+    'virology': {'accuracy - clean': 104, 'accuracy - input contaminated': 3, 'accuracy - input-and-label contaminated': 58, 'accuracy - not labeled': 0},
+    'high_school_statistics': {'accuracy - clean': 96, 'accuracy - input contaminated': 43, 'accuracy - input-and-label contaminated': 76, 'accuracy - not labeled': 0},
+    'nutrition': {'accuracy - clean': 172, 'accuracy - input contaminated': 11, 'accuracy - input-and-label contaminated': 98, 'accuracy - not labeled': 24},
+    'abstract_algebra': {'accuracy - clean': 84, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 7, 'accuracy - not labeled': 0},
+    'high_school_geography': {'accuracy - clean': 91, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 105, 'accuracy - not labeled': 0},
+    'econometrics': {'accuracy - clean': 62, 'accuracy - input contaminated': 13, 'accuracy - input-and-label contaminated': 38, 'accuracy - not labeled': 0},
+    'marketing': {'accuracy - clean': 115, 'accuracy - input contaminated': 15, 'accuracy - input-and-label contaminated': 101, 'accuracy - not labeled': 2},
+    'high_school_chemistry': {'accuracy - clean': 108, 'accuracy - input contaminated': 25, 'accuracy - input-and-label contaminated': 69, 'accuracy - not labeled': 0},
+    'prehistory': {'accuracy - clean': 154, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 107, 'accuracy - not labeled': 57},
+    'college_physics': {'accuracy - clean': 25, 'accuracy - input contaminated': 20, 'accuracy - input-and-label contaminated': 57, 'accuracy - not labeled': 0},
+    'management': {'accuracy - clean': 35, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 62, 'accuracy - not labeled': 0},
+    'college_biology': {'accuracy - clean': 91, 'accuracy - input contaminated': 12, 'accuracy - input-and-label contaminated': 40, 'accuracy - not labeled': 0},
+    'high_school_biology': {'accuracy - clean': 128, 'accuracy - input contaminated': 17, 'accuracy - input-and-label contaminated': 135, 'accuracy - not labeled': 29},
+    'high_school_physics': {'accuracy - clean': 42, 'accuracy - input contaminated': 28, 'accuracy - input-and-label contaminated': 80, 'accuracy - not labeled': 0},
+    'logical_fallacies': {'accuracy - clean': 133, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 24, 'accuracy - not labeled': 0},
+    'medical_genetics': {'accuracy - clean': 49, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 43, 'accuracy - not labeled': 1},
+    'machine_learning': {'accuracy - clean': 71, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 32, 'accuracy - not labeled': 0},
+    'professional_law': {'accuracy - clean': 401, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 5, 'accuracy - not labeled': 1119},
+    'professional_psychology': {'accuracy - clean': 265, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 27, 'accuracy - not labeled': 310},
+    'global_facts': {'accuracy - clean': 89, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 5, 'accuracy - not labeled': 0},
+    'us_foreign_policy': {'accuracy - clean': 71, 'accuracy - input contaminated': 3, 'accuracy - input-and-label contaminated': 25, 'accuracy - not labeled': 0},
+    'international_law': {'accuracy - clean': 73, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 46, 'accuracy - not labeled': 0},
+    'clinical_knowledge': {'accuracy - clean': 172, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 86, 'accuracy - not labeled': 0},
+    'high_school_mathematics': {'accuracy - clean': 178, 'accuracy - input contaminated': 59, 'accuracy - input-and-label contaminated': 32, 'accuracy - not labeled': 0},
+    'high_school_computer_science': {'accuracy - clean': 62, 'accuracy - input contaminated': 7, 'accuracy - input-and-label contaminated': 28, 'accuracy - not labeled': 2},
+    'college_computer_science': {'accuracy - clean': 68, 'accuracy - input contaminated': 15, 'accuracy - input-and-label contaminated': 15, 'accuracy - not labeled': 1},
+    'electrical_engineering': {'accuracy - clean': 75, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 61, 'accuracy - not labeled': 0},
+    'college_mathematics': {'accuracy - clean': 61, 'accuracy - input contaminated': 13, 'accuracy - input-and-label contaminated': 26, 'accuracy - not labeled': 0},
+    'computer_security': {'accuracy - clean': 55, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 36, 'accuracy - not labeled': 0},
+    'high_school_macroeconomics': {'accuracy - clean': 102, 'accuracy - input contaminated': 14, 'accuracy - input-and-label contaminated': 173, 'accuracy - not labeled': 100},
+    'astronomy': {'accuracy - clean': 112, 'accuracy - input contaminated': 4, 'accuracy - input-and-label contaminated': 35, 'accuracy - not labeled': 0},
+    'college_chemistry': {'accuracy - clean': 46, 'accuracy - input contaminated': 19, 'accuracy - input-and-label contaminated': 34, 'accuracy - not labeled': 0},
+    'high_school_european_history': {'accuracy - clean': 41, 'accuracy - input contaminated': 0, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 123},
+    'miscellaneous': {'accuracy - clean': 256, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 40, 'accuracy - not labeled': 477},
+    'formal_logic': {'accuracy - clean': 92, 'accuracy - input contaminated': 12, 'accuracy - input-and-label contaminated': 21, 'accuracy - not labeled': 0},
+    'elementary_mathematics': {'accuracy - clean': 155, 'accuracy - input contaminated': 31, 'accuracy - input-and-label contaminated': 103, 'accuracy - not labeled': 88},
+    'world_religions': {'accuracy - clean': 130, 'accuracy - input contaminated': 4, 'accuracy - input-and-label contaminated': 36, 'accuracy - not labeled': 0},
+    'professional_medicine': {'accuracy - clean': 191, 'accuracy - input contaminated': 43, 'accuracy - input-and-label contaminated': 1, 'accuracy - not labeled': 36},
+    'anatomy': {'accuracy - clean': 52, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 76, 'accuracy - not labeled': 0},
 }

@@ -166,7 +166,7 @@ for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accura
         'weights': weights,
     })
     for dataset_abbr, subsets in mmlu_name_and_subsets:
         weights = {f'lukaemon_mmlu_{i}': mmlu_category_weights[i][metric_name] for i in subsets}
         subsets = [[f'lukaemon_mmlu_{i}', metric_name] for i in subsets]

@@ -178,7 +178,7 @@ for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accura
         'weights': weights,
     })

 summary_groups.append({
     'name': 'hellaswag',
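The per-subject dictionaries above record how many MMLU samples fall into each contamination bucket, and the hunks at lines 166 and 178 feed them in as weights when building weighted summary groups. As a rough illustration only (the exact aggregation lives in opencompass's summarizer, not in this diff; the scores below are hypothetical), a count-weighted accuracy over two subjects would combine as:

# Hypothetical per-subset scores; weights are the 'accuracy - clean' counts above.
weights = {'business_ethics': 44, 'security_studies': 188}
scores = {'business_ethics': 62.5, 'security_studies': 70.0}

weighted_avg = sum(scores[k] * weights[k] for k in weights) / sum(weights.values())
print(round(weighted_avg, 2))  # 68.58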
configs/summarizers/example.py

@@ -14,5 +14,5 @@ with read_base():
     from .groups.mgsm import mgsm_summary_groups

 summarizer = dict(
-    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
configs/summarizers/groups/MMLUArabic.py

 sub_categories = {
-    'math': ['abstract_algebra', 'college_mathematics', 'elementary_mathematics', 'high_school_mathematics', 'high_school_statistics'],
-    'health': ['anatomy', 'clinical_knowledge', 'college_medicine', 'human_aging', 'medical_genetics', 'nutrition', 'professional_medicine', 'virology'],
-    'physics': ['astronomy', 'college_physics', 'conceptual_physics', 'high_school_physics'],
-    'business': ['business_ethics', 'management', 'marketing'],
-    'biology': ['college_biology', 'high_school_biology'],
-    'chemistry': ['college_chemistry', 'high_school_chemistry'],
-    'computer science': ['college_computer_science', 'computer_security', 'high_school_computer_science', 'machine_learning'],
-    'economics': ['econometrics', 'high_school_macroeconomics', 'high_school_microeconomics'],
-    'engineering': ['electrical_engineering'],
-    'philosophy': ['formal_logic', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'world_religions'],
-    'other': ['global_facts', 'miscellaneous', 'professional_accounting'],
-    'history': ['high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'prehistory'],
-    'geography': ['high_school_geography'],
-    'politics': ['high_school_government_and_politics', 'public_relations', 'security_studies', 'us_foreign_policy'],
-    'psychology': ['high_school_psychology', 'professional_psychology'],
-    'culture': ['human_sexuality', 'sociology'],
+    'math': ['abstract_algebra', 'college_mathematics', 'elementary_mathematics', 'high_school_mathematics', 'high_school_statistics'],
+    'health': ['anatomy', 'clinical_knowledge', 'college_medicine', 'human_aging', 'medical_genetics', 'nutrition', 'professional_medicine', 'virology'],
+    'physics': ['astronomy', 'college_physics', 'conceptual_physics', 'high_school_physics'],
+    'business': ['business_ethics', 'management', 'marketing'],
+    'biology': ['college_biology', 'high_school_biology'],
+    'chemistry': ['college_chemistry', 'high_school_chemistry'],
+    'computer science': ['college_computer_science', 'computer_security', 'high_school_computer_science', 'machine_learning'],
+    'economics': ['econometrics', 'high_school_macroeconomics', 'high_school_microeconomics'],
+    'engineering': ['electrical_engineering'],
+    'philosophy': ['formal_logic', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'world_religions'],
+    'other': ['global_facts', 'miscellaneous', 'professional_accounting'],
+    'history': ['high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'prehistory'],
+    'geography': ['high_school_geography'],
+    'politics': ['high_school_government_and_politics', 'public_relations', 'security_studies', 'us_foreign_policy'],
+    'psychology': ['high_school_psychology', 'professional_psychology'],
+    'culture': ['human_sexuality', 'sociology'],
     'law': ['international_law', 'jurisprudence', 'professional_law']
 }

 categories = {
-    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
-    "humanities": ["history", "philosophy", "law"],
-    "social_sciences": ["politics", "culture", "economics", "geography", "psychology"],
-    "other": ["other", "business", "health"],
+    'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
+    'humanities': ['history', 'philosophy', 'law'],
+    'social_sciences': ['politics', 'culture', 'economics', 'geography', 'psychology'],
+    'other': ['other', 'business', 'health'],
 }

 category2subject = {}
configs/summarizers/groups/cibench.py

@@ -392,4 +392,4 @@ cibench_summary_groups.extend([
         'subsets': [i[:2] for i in cibench_math],
         'weights': {f'{k[0]}@{k[1]}': k[-1] for k in cibench_math},
     },
-])
\ No newline at end of file
+])
configs/summarizers/groups/cmmlu.py

 subcategories = {
-    "agronomy": ['other'],
-    "anatomy": ['biology'],
-    "ancient_chinese": ['linguistics', 'china specific'],
-    "arts": ['arts'],
-    "astronomy": ['physics'],
-    "business_ethics": ['business'],
-    "chinese_civil_service_exam": ['politics', 'china specific'],
-    "chinese_driving_rule": ['other', 'china specific'],
-    "chinese_food_culture": ['culture', 'china specific'],
-    "chinese_foreign_policy": ['politics', 'china specific'],
-    "chinese_history":['history', 'china specific'],
-    "chinese_literature": ['literature', 'china specific'],
-    "chinese_teacher_qualification": ['education', 'china specific'],
-    "college_actuarial_science":['math'],
-    "college_education":['education'],
-    "college_engineering_hydrology": ['engineering'],
-    "college_law": ['law'],
-    "college_mathematics": ['math'],
-    "college_medical_statistics":['statistics'],
-    "clinical_knowledge": ['other'],
-    "college_medicine": ['other'],
-    "computer_science": ['computer science'],
-    "computer_security": ['other'],
-    "conceptual_physics": ['physics'],
-    "construction_project_management": ['other', 'china specific'],
-    "economics": ['economics'],
-    "education": ['education'],
-    "elementary_chinese":['linguistics', 'china specific'],
-    "elementary_commonsense":['other', 'china specific'],
-    "elementary_information_and_technology": ['other'],
-    "electrical_engineering": ['engineering'],
-    "elementary_mathematics": ['math'],
-    "ethnology": ['culture', 'china specific'],
-    "food_science": ['other'],
-    "genetics": ['biology'],
-    "global_facts": ['global'],
-    "high_school_biology": ['biology'],
-    "high_school_chemistry": ['chemistry'],
-    "high_school_geography": ['geography'],
-    "high_school_mathematics": ['math'],
-    "high_school_physics": ['physics'],
-    "high_school_politics": ['politics', 'china specific'],
-    "human_sexuality": ['other'],
-    "international_law": ['law'],
-    "journalism": ['sociology'],
-    "jurisprudence": ['law'],
-    "legal_and_moral_basis": ['other'],
-    "logical": ['philosophy'],
-    "machine_learning": ['computer science'],
-    "management": ['business'],
-    "marketing": ['business'],
-    "marxist_theory": ['philosophy'],
-    "modern_chinese": ['linguistics', 'china specific'],
-    "nutrition": ['other'],
-    "philosophy": ['philosophy'],
-    "professional_accounting": ['business'],
-    "professional_law": ['law'],
-    "professional_medicine": ['other'],
-    "professional_psychology": ['psychology'],
-    "public_relations": ['politics'],
-    "security_study": ['politics'],
-    "sociology": ['culture'],
-    "sports_science": ['other'],
-    "traditional_chinese_medicine": ['other', 'china specific'],
-    "virology": ['biology'],
-    "world_history":['history'],
-    "world_religions": ['global'],
+    'agronomy': ['other'],
+    'anatomy': ['biology'],
+    'ancient_chinese': ['linguistics', 'china specific'],
+    'arts': ['arts'],
+    'astronomy': ['physics'],
+    'business_ethics': ['business'],
+    'chinese_civil_service_exam': ['politics', 'china specific'],
+    'chinese_driving_rule': ['other', 'china specific'],
+    'chinese_food_culture': ['culture', 'china specific'],
+    'chinese_foreign_policy': ['politics', 'china specific'],
+    'chinese_history':['history', 'china specific'],
+    'chinese_literature': ['literature', 'china specific'],
+    'chinese_teacher_qualification': ['education', 'china specific'],
+    'college_actuarial_science':['math'],
+    'college_education':['education'],
+    'college_engineering_hydrology': ['engineering'],
+    'college_law': ['law'],
+    'college_mathematics': ['math'],
+    'college_medical_statistics':['statistics'],
+    'clinical_knowledge': ['other'],
+    'college_medicine': ['other'],
+    'computer_science': ['computer science'],
+    'computer_security': ['other'],
+    'conceptual_physics': ['physics'],
+    'construction_project_management': ['other', 'china specific'],
+    'economics': ['economics'],
+    'education': ['education'],
+    'elementary_chinese':['linguistics', 'china specific'],
+    'elementary_commonsense':['other', 'china specific'],
+    'elementary_information_and_technology': ['other'],
+    'electrical_engineering': ['engineering'],
+    'elementary_mathematics': ['math'],
+    'ethnology': ['culture', 'china specific'],
+    'food_science': ['other'],
+    'genetics': ['biology'],
+    'global_facts': ['global'],
+    'high_school_biology': ['biology'],
+    'high_school_chemistry': ['chemistry'],
+    'high_school_geography': ['geography'],
+    'high_school_mathematics': ['math'],
+    'high_school_physics': ['physics'],
+    'high_school_politics': ['politics', 'china specific'],
+    'human_sexuality': ['other'],
+    'international_law': ['law'],
+    'journalism': ['sociology'],
+    'jurisprudence': ['law'],
+    'legal_and_moral_basis': ['other'],
+    'logical': ['philosophy'],
+    'machine_learning': ['computer science'],
+    'management': ['business'],
+    'marketing': ['business'],
+    'marxist_theory': ['philosophy'],
+    'modern_chinese': ['linguistics', 'china specific'],
+    'nutrition': ['other'],
+    'philosophy': ['philosophy'],
+    'professional_accounting': ['business'],
+    'professional_law': ['law'],
+    'professional_medicine': ['other'],
+    'professional_psychology': ['psychology'],
+    'public_relations': ['politics'],
+    'security_study': ['politics'],
+    'sociology': ['culture'],
+    'sports_science': ['other'],
+    'traditional_chinese_medicine': ['other', 'china specific'],
+    'virology': ['biology'],
+    'world_history':['history'],
+    'world_religions': ['global'],
 }

 categories = {
-    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
-    "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
-    "Social Science": ['linguistics', "business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
-    "Other":["other"],
-    "China specific": ["china specific"],
+    'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering', 'statistics'],
+    'Humanities': ['history', 'philosophy', 'law', 'arts', 'literature', 'global'],
+    'Social Science': ['linguistics', 'business', 'politics', 'culture', 'economics', 'geography', 'psychology', 'education', 'sociology'],
+    'Other':['other'],
+    'China specific': ['china specific'],
 }

 category2subject = {}
configs/summarizers/groups/lawbench.py

 names = [
-    ["1-1", "article_recitation"],
-    ["1-2", "knowledge_question_answering"],
-    ["2-1", "document_proofreading"],
-    ["2-2", "dispute_focus_identification"],
-    ["2-3", "marital_disputes_identification"],
-    ["2-4", "issue_topic_identification"],
-    ["2-5", "reading_comprehension"],
-    ["2-6", "named_entity_recognition"],
-    ["2-7", "opinion_summarization"],
-    ["2-8", "argument_mining"],
-    ["2-9", "event_detection"],
-    ["2-10", "trigger_word_extraction"],
-    ["3-1", "fact_based_article_prediction"],
-    ["3-2", "scene_based_article_prediction"],
-    ["3-3", "charge_prediction"],
-    ["3-4", "prison_term_prediction_wo_article"],
-    ["3-5", "prison_term_prediction_w_article"],
-    ["3-6", "case_analysis"],
-    ["3-7", "criminal_damages_calculation"],
-    ["3-8", "consultation"],
+    ['1-1', 'article_recitation'],
+    ['1-2', 'knowledge_question_answering'],
+    ['2-1', 'document_proofreading'],
+    ['2-2', 'dispute_focus_identification'],
+    ['2-3', 'marital_disputes_identification'],
+    ['2-4', 'issue_topic_identification'],
+    ['2-5', 'reading_comprehension'],
+    ['2-6', 'named_entity_recognition'],
+    ['2-7', 'opinion_summarization'],
+    ['2-8', 'argument_mining'],
+    ['2-9', 'event_detection'],
+    ['2-10', 'trigger_word_extraction'],
+    ['3-1', 'fact_based_article_prediction'],
+    ['3-2', 'scene_based_article_prediction'],
+    ['3-3', 'charge_prediction'],
+    ['3-4', 'prison_term_prediction_wo_article'],
+    ['3-5', 'prison_term_prediction_w_article'],
+    ['3-6', 'case_analysis'],
+    ['3-7', 'criminal_damages_calculation'],
+    ['3-8', 'consultation'],
 ]

 lawbench_summary_groups = []
configs/summarizers/groups/leval.py

 leval_summary_groups = [
-    {"name": "leval", "subsets": ["LEval_coursera", "LEval_gsm100", "LEval_quality", "LEval_tpo", "LEval_topic_retrieval", "LEval_financialqa", "LEval_gov_report_summ", "LEval_legal_contract_qa", "LEval_meeting_summ", "LEval_multidocqa", "LEval_narrativeqa", "LEval_nq", "LEval_news_summ", "LEval_paper_assistant", "LEval_patent_summ", "LEval_review_summ", "LEval_scientificqa", "LEval_tvshow_summ"]},
+    {'name': 'leval', 'subsets': ['LEval_coursera', 'LEval_gsm100', 'LEval_quality', 'LEval_tpo', 'LEval_topic_retrieval', 'LEval_financialqa', 'LEval_gov_report_summ', 'LEval_legal_contract_qa', 'LEval_meeting_summ', 'LEval_multidocqa', 'LEval_narrativeqa', 'LEval_nq', 'LEval_news_summ', 'LEval_paper_assistant', 'LEval_patent_summ', 'LEval_review_summ', 'LEval_scientificqa', 'LEval_tvshow_summ']},
 ]
configs/summarizers/groups/lveval.py

-len_levels = ["16k", "32k", "64k", "128k", "256k"]
+len_levels = ['16k', '32k', '64k', '128k', '256k']

 subsets_lveval_loogle_SD_mixup = [
-    "LVEval_loogle_SD_mixup" + "_" + len_level for len_level in len_levels
+    'LVEval_loogle_SD_mixup' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_cmrc_mixup = [
-    "LVEval_cmrc_mixup" + "_" + len_level for len_level in len_levels
+    'LVEval_cmrc_mixup' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_multifieldqa_en_mixup = [
-    "LVEval_multifieldqa_en_mixup" + "_" + len_level
+    'LVEval_multifieldqa_en_mixup' + '_' + len_level
     for len_level in len_levels
 ]
 subsets_lveval_multifieldqa_zh_mixup = [
-    "LVEval_multifieldqa_zh_mixup" + "_" + len_level
+    'LVEval_multifieldqa_zh_mixup' + '_' + len_level
     for len_level in len_levels
 ]
 subsets_lveval_dureader_mixup = [
-    "LVEval_dureader_mixup" + "_" + len_level for len_level in len_levels
+    'LVEval_dureader_mixup' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_loogle_CR_mixup = [
-    "LVEval_loogle_CR_mixup" + "_" + len_level for len_level in len_levels
+    'LVEval_loogle_CR_mixup' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_loogle_MIR_mixup = [
-    "LVEval_loogle_MIR_mixup" + "_" + len_level for len_level in len_levels
+    'LVEval_loogle_MIR_mixup' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_hotpotwikiqa_mixup = [
-    "LVEval_hotpotwikiqa_mixup" + "_" + len_level for len_level in len_levels
+    'LVEval_hotpotwikiqa_mixup' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_lic_mixup = [
-    "LVEval_lic_mixup" + "_" + len_level for len_level in len_levels
+    'LVEval_lic_mixup' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_factrecall_en = [
-    "LVEval_factrecall_en" + "_" + len_level for len_level in len_levels
+    'LVEval_factrecall_en' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_factrecall_zh = [
-    "LVEval_factrecall_zh" + "_" + len_level for len_level in len_levels
+    'LVEval_factrecall_zh' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_single_hop_qa = (

@@ -64,47 +64,47 @@ subsets_lveval_qa = (
 lveval_summary_groups = [
     {
-        "name": "LVEval_loogle_SD_mixup",
-        "subsets": subsets_lveval_loogle_SD_mixup,
+        'name': 'LVEval_loogle_SD_mixup',
+        'subsets': subsets_lveval_loogle_SD_mixup,
     },
-    {"name": "LVEval_cmrc_mixup", "subsets": subsets_lveval_cmrc_mixup},
+    {'name': 'LVEval_cmrc_mixup', 'subsets': subsets_lveval_cmrc_mixup},
     {
-        "name": "LVEval_multifieldqa_en_mixup",
-        "subsets": subsets_lveval_multifieldqa_en_mixup,
+        'name': 'LVEval_multifieldqa_en_mixup',
+        'subsets': subsets_lveval_multifieldqa_en_mixup,
     },
     {
-        "name": "LVEval_multifieldqa_zh_mixup",
-        "subsets": subsets_lveval_multifieldqa_zh_mixup,
+        'name': 'LVEval_multifieldqa_zh_mixup',
+        'subsets': subsets_lveval_multifieldqa_zh_mixup,
     },
     {
-        "name": "LVEval_dureader_mixup",
-        "subsets": subsets_lveval_dureader_mixup,
+        'name': 'LVEval_dureader_mixup',
+        'subsets': subsets_lveval_dureader_mixup,
     },
     {
-        "name": "LVEval_loogle_CR_mixup",
-        "subsets": subsets_lveval_loogle_CR_mixup,
+        'name': 'LVEval_loogle_CR_mixup',
+        'subsets': subsets_lveval_loogle_CR_mixup,
     },
     {
-        "name": "LVEval_loogle_MIR_mixup",
-        "subsets": subsets_lveval_loogle_MIR_mixup,
+        'name': 'LVEval_loogle_MIR_mixup',
+        'subsets': subsets_lveval_loogle_MIR_mixup,
     },
     {
-        "name": "LVEval_hotpotwikiqa_mixup",
-        "subsets": subsets_lveval_hotpotwikiqa_mixup,
+        'name': 'LVEval_hotpotwikiqa_mixup',
+        'subsets': subsets_lveval_hotpotwikiqa_mixup,
     },
-    {"name": "LVEval_lic_mixup", "subsets": subsets_lveval_lic_mixup},
-    {"name": "LVEval_factrecall_en", "subsets": subsets_lveval_factrecall_en},
-    {"name": "LVEval_factrecall_zh", "subsets": subsets_lveval_factrecall_zh},
-    {"name": "LVEval_single_hop_qa", "subsets": subsets_lveval_single_hop_qa},
+    {'name': 'LVEval_lic_mixup', 'subsets': subsets_lveval_lic_mixup},
+    {'name': 'LVEval_factrecall_en', 'subsets': subsets_lveval_factrecall_en},
+    {'name': 'LVEval_factrecall_zh', 'subsets': subsets_lveval_factrecall_zh},
+    {'name': 'LVEval_single_hop_qa', 'subsets': subsets_lveval_single_hop_qa},
     {
-        "name": "LVEval_single_hop_cqa",
-        "subsets": subsets_lveval_single_hop_cqa,
+        'name': 'LVEval_single_hop_cqa',
+        'subsets': subsets_lveval_single_hop_cqa,
     },
-    {"name": "LVEval_multi_hop_qa", "subsets": subsets_lveval_multi_hop_qa},
-    {"name": "LVEval_multi_hop_cqa", "subsets": subsets_lveval_multi_hop_cqa},
+    {'name': 'LVEval_multi_hop_qa', 'subsets': subsets_lveval_multi_hop_qa},
+    {'name': 'LVEval_multi_hop_cqa', 'subsets': subsets_lveval_multi_hop_cqa},
     {
-        "name": "LVEval_factrecall_cqa",
-        "subsets": subsets_lveval_factrecall_cqa,
+        'name': 'LVEval_factrecall_cqa',
+        'subsets': subsets_lveval_factrecall_cqa,
     },
-    {"name": "LVEval_qa", "subsets": subsets_lveval_qa},
+    {'name': 'LVEval_qa', 'subsets': subsets_lveval_qa},
 ]
configs/summarizers/groups/mgsm.py

-ALL_LANGUAGES = ["bn", "de", "en", "es", "fr", "ja", "ru", "sw", "te", "th", "zh"]
-LATIN_LANGUAGES = ["de", "en", "es", "fr", "sw"]
-NON_LATIN_LANGUAGES = ["bn", "ja", "ru", "te", "th", "zh"]
+ALL_LANGUAGES = ['bn', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'sw', 'te', 'th', 'zh']
+LATIN_LANGUAGES = ['de', 'en', 'es', 'fr', 'sw']
+NON_LATIN_LANGUAGES = ['bn', 'ja', 'ru', 'te', 'th', 'zh']

 mgsm_summary_groups = [
     {'name': 'mgsm_latin', 'subsets': [f'mgsm_{lang}' for lang in LATIN_LANGUAGES]},
configs/summarizers/groups/scibench.py

 scibench_summary_groups = []
-scibench_tasks = ["atkins", "calculus", "chemmc", "class", "diff", "fund", "matter", "quan", "stat", "thermo"]
-for suffix in ["", "_zs-cot", "_fs", "_fs-cot"]:
-    subsets = [f"scibench-{subset}{suffix}" for subset in scibench_tasks]
+scibench_tasks = ['atkins', 'calculus', 'chemmc', 'class', 'diff', 'fund', 'matter', 'quan', 'stat', 'thermo']
+for suffix in ['', '_zs-cot', '_fs', '_fs-cot']:
+    subsets = [f'scibench-{subset}{suffix}' for subset in scibench_tasks]
     scibench_summary_groups.append({'name': f'scibench{suffix}', 'subsets': subsets})
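For reference, the loop above expands into four summary groups, one per prompting variant. A quick check of the names it produces (with a truncated task list for brevity; behavior is taken directly from the hunk above):

scibench_tasks = ['atkins', 'calculus']
scibench_summary_groups = []
for suffix in ['', '_zs-cot', '_fs', '_fs-cot']:
    # e.g. 'scibench-atkins_fs-cot' for the few-shot chain-of-thought variant
    subsets = [f'scibench-{subset}{suffix}' for subset in scibench_tasks]
    scibench_summary_groups.append({'name': f'scibench{suffix}', 'subsets': subsets})
print([g['name'] for g in scibench_summary_groups])
# ['scibench', 'scibench_zs-cot', 'scibench_fs', 'scibench_fs-cot']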
configs/summarizers/groups/teval.py

@@ -71,4 +71,3 @@ for group in _base_summary_groups:
     group['name'] = group['name'] + '_zh'
     group['subsets'] = [[subset[0] + '_zh', subset[1]] for subset in group['subsets']]
     teval_summary_groups.append(group)
-
configs/summarizers/groups/xiezhi.py

 xiezhi_summary_groups = []
-_xiezhi = ["xiezhi-spec_eng", "xiezhi-spec_chn", "xiezhi-inter_eng", "xiezhi-inter_chn"]
+_xiezhi = ['xiezhi-spec_eng', 'xiezhi-spec_chn', 'xiezhi-inter_eng', 'xiezhi-inter_chn']
 xiezhi_summary_groups.append({'name': 'xiezhi', 'subsets': _xiezhi})
configs/summarizers/infinitebench.py

@@ -2,7 +2,7 @@ from mmengine.config import read_base
 with read_base():
     from .groups.infinitebench import infinitebench_summary_groups

 summarizer = dict(
-    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
configs/summarizers/internlm2_keyset.py

@@ -16,5 +16,5 @@ summarizer = dict(
         ['sanitized_mbpp', 'score'],
     ],
     summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
configs/summarizers/lawbench.py

@@ -50,7 +50,7 @@ summarizer = dict(
         'lawbench-3-7-criminal_damages_calculation-1-shot',
         'lawbench-3-8-consultation-1-shot',
     ],
-    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
     prompt_db=dict(
         database_path='configs/datasets/log.json',
         config_dir='configs/datasets',
configs/summarizers/leaderboard.py

@@ -13,11 +13,11 @@ with read_base():
 other_summary_groups = []
-other_summary_groups.append({'name': 'Exam', 'subsets': ["ceval", 'agieval', 'mmlu', 'cmmlu', "GaokaoBench", 'ARC-c', 'ARC-e']})
+other_summary_groups.append({'name': 'Exam', 'subsets': ['ceval', 'agieval', 'mmlu', 'cmmlu', 'GaokaoBench', 'ARC-c', 'ARC-e']})
 other_summary_groups.append({'name': 'Language', 'subsets': ['WiC', 'chid-dev', 'afqmc-dev', 'WSC', 'tydiqa-goldp', 'flores_100']})
 other_summary_groups.append({'name': 'Knowledge', 'subsets': ['BoolQ', 'commonsense_qa', 'triviaqa', 'nq']})
 other_summary_groups.append({'name': 'Understanding', 'subsets': ['C3', 'race-middle', 'race-high', 'openbookqa_fact', 'csl_dev', 'lcsts', 'Xsum', 'eprstmt-dev', 'lambada']})
-other_summary_groups.append({'name': 'Reasoning', 'subsets': ['cmnli', 'ocnli', 'AX_b', 'AX_g', 'RTE', 'COPA', 'ReCoRD', 'hellaswag', 'piqa', 'siqa', 'math', 'gsm8k', 'drop', 'openai_humaneval', 'mbpp', "bbh"]})
+other_summary_groups.append({'name': 'Reasoning', 'subsets': ['cmnli', 'ocnli', 'AX_b', 'AX_g', 'RTE', 'COPA', 'ReCoRD', 'hellaswag', 'piqa', 'siqa', 'math', 'gsm8k', 'drop', 'openai_humaneval', 'mbpp', 'bbh']})
 other_summary_groups.append({'name': 'Overall', 'subsets': ['Exam', 'Language', 'Knowledge', 'Understanding', 'Reasoning']})

 summarizer = dict(

@@ -30,11 +30,11 @@ summarizer = dict(
         'Reasoning',
         '--------- 考试 Exam ---------',  # category
         # 'Mixed', # subcategory
-        "ceval",
+        'ceval',
         'agieval',
         'mmlu',
         'cmmlu',
-        "GaokaoBench",
+        'GaokaoBench',
         'ARC-c',
         'ARC-e',
         '--------- 语言 Language ---------',  # category

@@ -92,8 +92,8 @@ summarizer = dict(
         'openai_humaneval',
         'mbpp',
         # '综合推理', # subcategory
-        "bbh",
+        'bbh',
     ],
     summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
configs/summarizers/lveval.py

@@ -5,110 +5,110 @@ with read_base():
 summarizer = dict(
     dataset_abbrs=[
-        "----------------------------------------",
-        "--------- LVEval All ---------",  # category
-        "----------------------------------------",
-        "LVEval_qa",
-        "----------------------------------------",
-        "--------- LVEval Tasks All ---------",  # category
-        "----------------------------------------",
-        "LVEval_single_hop_qa",
-        "LVEval_single_hop_cqa",
-        "LVEval_multi_hop_qa",
-        "LVEval_multi_hop_cqa",
-        "LVEval_factrecall_cqa",
-        "----------------------------------------",
-        "--------- LVEval Datasets All ---------",  # category
-        "----------------------------------------",
-        "LVEval_loogle_SD_mixup",
-        "LVEval_cmrc_mixup",
-        "LVEval_multifieldqa_en_mixup",
-        "LVEval_multifieldqa_zh_mixup",
-        "LVEval_dureader_mixup",
-        "LVEval_loogle_CR_mixup",
-        "LVEval_loogle_MIR_mixup",
-        "LVEval_hotpotwikiqa_mixup",
-        "LVEval_lic_mixup",
-        "LVEval_factrecall_en",
-        "LVEval_factrecall_zh",
-        "----------------------------------------",
-        "--------- LVEval Single_Hop QA ---------",  # category
-        "----------------------------------------",
-        "LVEval_loogle_SD_mixup_16k",
-        "LVEval_loogle_SD_mixup_32k",
-        "LVEval_loogle_SD_mixup_64k",
-        "LVEval_loogle_SD_mixup_128k",
-        "LVEval_loogle_SD_mixup_256k",
-        "----------------------------------------",
-        "LVEval_cmrc_mixup_16k",
-        "LVEval_cmrc_mixup_32k",
-        "LVEval_cmrc_mixup_64k",
-        "LVEval_cmrc_mixup_128k",
-        "LVEval_cmrc_mixup_256k",
-        "----------------------------------------",
-        "--------- LVEval Single_Hop CQA ---------",  # category
-        "----------------------------------------",
-        "LVEval_multifieldqa_en_mixup_16k",
-        "LVEval_multifieldqa_en_mixup_32k",
-        "LVEval_multifieldqa_en_mixup_64k",
-        "LVEval_multifieldqa_en_mixup_128k",
-        "LVEval_multifieldqa_en_mixup_256k",
-        "----------------------------------------",
-        "LVEval_multifieldqa_zh_mixup_16k",
-        "LVEval_multifieldqa_zh_mixup_32k",
-        "LVEval_multifieldqa_zh_mixup_64k",
-        "LVEval_multifieldqa_zh_mixup_128k",
-        "LVEval_multifieldqa_zh_mixup_256k",
-        "----------------------------------------",
-        "--------- LVEval Multi_Hop QA ---------",  # category
-        "----------------------------------------",
-        "LVEval_dureader_mixup_16k",
-        "LVEval_dureader_mixup_32k",
-        "LVEval_dureader_mixup_64k",
-        "LVEval_dureader_mixup_128k",
-        "LVEval_dureader_mixup_256k",
-        "----------------------------------------",
-        "LVEval_loogle_CR_mixup_16k",
-        "LVEval_loogle_CR_mixup_32k",
-        "LVEval_loogle_CR_mixup_64k",
-        "LVEval_loogle_CR_mixup_128k",
-        "LVEval_loogle_CR_mixup_256k",
-        "----------------------------------------",
-        "LVEval_loogle_MIR_mixup_16k",
-        "LVEval_loogle_MIR_mixup_32k",
-        "LVEval_loogle_MIR_mixup_64k",
-        "LVEval_loogle_MIR_mixup_128k",
-        "LVEval_loogle_MIR_mixup_256k",
-        "----------------------------------------",
-        "--------- LVEval Multi_Hop CQA ---------",  # category
-        "----------------------------------------",
-        "LVEval_hotpotwikiqa_mixup_16k",
-        "LVEval_hotpotwikiqa_mixup_32k",
-        "LVEval_hotpotwikiqa_mixup_64k",
-        "LVEval_hotpotwikiqa_mixup_128k",
-        "LVEval_hotpotwikiqa_mixup_256k",
-        "----------------------------------------",
-        "LVEval_lic_mixup_16k",
-        "LVEval_lic_mixup_32k",
-        "LVEval_lic_mixup_64k",
-        "LVEval_lic_mixup_128k",
-        "LVEval_lic_mixup_256k",
-        "----------------------------------------",
-        "--------- LVEval Factrecall CQA ---------",  # category
-        "----------------------------------------",
-        "LVEval_factrecall_en_16k",
-        "LVEval_factrecall_en_32k",
-        "LVEval_factrecall_en_64k",
-        "LVEval_factrecall_en_128k",
-        "LVEval_factrecall_en_256k",
-        "----------------------------------------",
-        "LVEval_factrecall_zh_16k",
-        "LVEval_factrecall_zh_32k",
-        "LVEval_factrecall_zh_64k",
-        "LVEval_factrecall_zh_128k",
-        "LVEval_factrecall_zh_256k",
+        '----------------------------------------',
+        '--------- LVEval All ---------',  # category
+        '----------------------------------------',
+        'LVEval_qa',
+        '----------------------------------------',
+        '--------- LVEval Tasks All ---------',  # category
+        '----------------------------------------',
+        'LVEval_single_hop_qa',
+        'LVEval_single_hop_cqa',
+        'LVEval_multi_hop_qa',
+        'LVEval_multi_hop_cqa',
+        'LVEval_factrecall_cqa',
+        '----------------------------------------',
+        '--------- LVEval Datasets All ---------',  # category
+        '----------------------------------------',
+        'LVEval_loogle_SD_mixup',
+        'LVEval_cmrc_mixup',
+        'LVEval_multifieldqa_en_mixup',
+        'LVEval_multifieldqa_zh_mixup',
+        'LVEval_dureader_mixup',
+        'LVEval_loogle_CR_mixup',
+        'LVEval_loogle_MIR_mixup',
+        'LVEval_hotpotwikiqa_mixup',
+        'LVEval_lic_mixup',
+        'LVEval_factrecall_en',
+        'LVEval_factrecall_zh',
+        '----------------------------------------',
+        '--------- LVEval Single_Hop QA ---------',  # category
+        '----------------------------------------',
+        'LVEval_loogle_SD_mixup_16k',
+        'LVEval_loogle_SD_mixup_32k',
+        'LVEval_loogle_SD_mixup_64k',
+        'LVEval_loogle_SD_mixup_128k',
+        'LVEval_loogle_SD_mixup_256k',
+        '----------------------------------------',
+        'LVEval_cmrc_mixup_16k',
+        'LVEval_cmrc_mixup_32k',
+        'LVEval_cmrc_mixup_64k',
+        'LVEval_cmrc_mixup_128k',
+        'LVEval_cmrc_mixup_256k',
+        '----------------------------------------',
+        '--------- LVEval Single_Hop CQA ---------',  # category
+        '----------------------------------------',
+        'LVEval_multifieldqa_en_mixup_16k',
+        'LVEval_multifieldqa_en_mixup_32k',
+        'LVEval_multifieldqa_en_mixup_64k',
+        'LVEval_multifieldqa_en_mixup_128k',
+        'LVEval_multifieldqa_en_mixup_256k',
+        '----------------------------------------',
+        'LVEval_multifieldqa_zh_mixup_16k',
+        'LVEval_multifieldqa_zh_mixup_32k',
+        'LVEval_multifieldqa_zh_mixup_64k',
+        'LVEval_multifieldqa_zh_mixup_128k',
+        'LVEval_multifieldqa_zh_mixup_256k',
+        '----------------------------------------',
+        '--------- LVEval Multi_Hop QA ---------',  # category
+        '----------------------------------------',
+        'LVEval_dureader_mixup_16k',
+        'LVEval_dureader_mixup_32k',
+        'LVEval_dureader_mixup_64k',
+        'LVEval_dureader_mixup_128k',
+        'LVEval_dureader_mixup_256k',
+        '----------------------------------------',
+        'LVEval_loogle_CR_mixup_16k',
+        'LVEval_loogle_CR_mixup_32k',
+        'LVEval_loogle_CR_mixup_64k',
+        'LVEval_loogle_CR_mixup_128k',
+        'LVEval_loogle_CR_mixup_256k',
+        '----------------------------------------',
+        'LVEval_loogle_MIR_mixup_16k',
+        'LVEval_loogle_MIR_mixup_32k',
+        'LVEval_loogle_MIR_mixup_64k',
+        'LVEval_loogle_MIR_mixup_128k',
+        'LVEval_loogle_MIR_mixup_256k',
+        '----------------------------------------',
+        '--------- LVEval Multi_Hop CQA ---------',  # category
+        '----------------------------------------',
+        'LVEval_hotpotwikiqa_mixup_16k',
+        'LVEval_hotpotwikiqa_mixup_32k',
+        'LVEval_hotpotwikiqa_mixup_64k',
+        'LVEval_hotpotwikiqa_mixup_128k',
+        'LVEval_hotpotwikiqa_mixup_256k',
+        '----------------------------------------',
+        'LVEval_lic_mixup_16k',
+        'LVEval_lic_mixup_32k',
+        'LVEval_lic_mixup_64k',
+        'LVEval_lic_mixup_128k',
+        'LVEval_lic_mixup_256k',
+        '----------------------------------------',
+        '--------- LVEval Factrecall CQA ---------',  # category
+        '----------------------------------------',
+        'LVEval_factrecall_en_16k',
+        'LVEval_factrecall_en_32k',
+        'LVEval_factrecall_en_64k',
+        'LVEval_factrecall_en_128k',
+        'LVEval_factrecall_en_256k',
+        '----------------------------------------',
+        'LVEval_factrecall_zh_16k',
+        'LVEval_factrecall_zh_32k',
+        'LVEval_factrecall_zh_64k',
+        'LVEval_factrecall_zh_128k',
+        'LVEval_factrecall_zh_256k',
     ],
     summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith("_summary_groups")], []
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []
     ),
 )
configs/summarizers/math_agent.py

@@ -21,5 +21,5 @@ summarizer = dict(
         'mathbench-circular-and-cloze-agent',
     ],
     summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
 )
configs/summarizers/math_baseline.py

@@ -15,5 +15,5 @@ summarizer = dict(
         'mathbench-circular-and-cloze',
     ],
     summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
 )