OpenDAS / opencompass · Commits · aa2dd2b5

Commit aa2dd2b5 (unverified)
[Format] Add config lints (#892)
Authored May 14, 2024 by Fengzhe Zhou; committed by GitHub on May 14, 2024.
Parent: 3dbba119
Changes: 648
Showing 20 changed files with 359 additions and 360 deletions (+359 -360).
configs/summarizers/compassbench_v1_objective.py  +25 -25
configs/summarizers/contamination.py  +59 -59
configs/summarizers/example.py  +1 -1
configs/summarizers/groups/MMLUArabic.py  +20 -20
configs/summarizers/groups/cibench.py  +1 -1
configs/summarizers/groups/cmmlu.py  +72 -72
configs/summarizers/groups/lawbench.py  +20 -20
configs/summarizers/groups/leval.py  +1 -1
configs/summarizers/groups/lveval.py  +38 -38
configs/summarizers/groups/mgsm.py  +3 -3
configs/summarizers/groups/scibench.py  +3 -3
configs/summarizers/groups/teval.py  +0 -1
configs/summarizers/groups/xiezhi.py  +1 -1
configs/summarizers/infinitebench.py  +2 -2
configs/summarizers/internlm2_keyset.py  +1 -1
configs/summarizers/lawbench.py  +1 -1
configs/summarizers/leaderboard.py  +6 -6
configs/summarizers/lveval.py  +103 -103
configs/summarizers/math_agent.py  +1 -1
configs/summarizers/math_baseline.py  +1 -1
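The commit title announces config lints, and the diffs below are dominated by one mechanical change: double-quoted string literals in the summarizer configs are rewritten to single quotes. A rough sketch, using Python's standard tokenize module, of the kind of check such a quote-style lint performs; the tooling actually added by this commit is not shown on this page, so the snippet is an assumption about its behaviour rather than the real linter.

import io
import tokenize

def find_double_quoted_strings(source: str):
    """Yield (line, column, text) for simple double-quoted string literals.

    A rough stand-in for a quote-style lint (such as the single-quote
    preference the diffs below enforce); real linters handle string
    prefixes, escapes and docstrings far more carefully.
    """
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type == tokenize.STRING and tok.string.startswith('"') and not tok.string.startswith('"""'):
            yield tok.start[0], tok.start[1], tok.string

# Illustrative input only: a one-line config fragment with one offending literal.
config = 'summarizer = dict(summary_groups=sum([], []), name="average_cn")\n'
for line, col, text in find_double_quoted_strings(config):
    print(f'line {line}, col {col}: prefer single quotes for {text}')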
configs/summarizers/compassbench_v1_objective.py

@@ -115,36 +115,36 @@ agent_summary_groups = [
 other_summary_groups = [
     {
-        "name": "average_cn",
-        "subsets": [
-            ["language_zh_perf_4_and_non_mcq", "naive_average"],
-            ["knowledge_cn", "perf_4"],
-            ["reasonbench_cn_circular", "perf_circular"],
-            ["math_perf_4_and_fill_in_blank_cn", "naive_average"],
-            ["code_cn", "naive_average"],
-            ["agent_cn", "naive_average"],
+        'name': 'average_cn',
+        'subsets': [
+            ['language_zh_perf_4_and_non_mcq', 'naive_average'],
+            ['knowledge_cn', 'perf_4'],
+            ['reasonbench_cn_circular', 'perf_circular'],
+            ['math_perf_4_and_fill_in_blank_cn', 'naive_average'],
+            ['code_cn', 'naive_average'],
+            ['agent_cn', 'naive_average'],
         ],
     },
     {
-        "name": "average_en",
-        "subsets": [
-            ["language_en_perf_4_and_non_mcq", "naive_average"],
-            ["compassbench_v1_knowledge-mixed-cloze_en", "score"],
-            ["reasonbench_en_circular", "perf_circular"],
-            ["math_perf_4_and_fill_in_blank_en", "naive_average"],
-            ["code_en", "naive_average"],
-            ["agent_en", "naive_average"],
+        'name': 'average_en',
+        'subsets': [
+            ['language_en_perf_4_and_non_mcq', 'naive_average'],
+            ['compassbench_v1_knowledge-mixed-cloze_en', 'score'],
+            ['reasonbench_en_circular', 'perf_circular'],
+            ['math_perf_4_and_fill_in_blank_en', 'naive_average'],
+            ['code_en', 'naive_average'],
+            ['agent_en', 'naive_average'],
         ],
     },
     {
-        "name": "average",
-        "subsets": [
-            ["language_perf_4_and_non_mcq", "naive_average"],
-            ["knowledge_perf_4_and_cloze", "naive_average"],
-            ["reasonbench", "perf_circular"],
-            ["math_perf_4_and_fill_in_blank", "naive_average"],
-            ["code", "naive_average"],
-            ["agent", "naive_average"],
+        'name': 'average',
+        'subsets': [
+            ['language_perf_4_and_non_mcq', 'naive_average'],
+            ['knowledge_perf_4_and_cloze', 'naive_average'],
+            ['reasonbench', 'perf_circular'],
+            ['math_perf_4_and_fill_in_blank', 'naive_average'],
+            ['code', 'naive_average'],
+            ['agent', 'naive_average'],
         ],
     },
 ]
...
@@ -223,5 +223,5 @@ summarizer = dict(
         ['plugin_eval-mus-p10_one_review_zh', 'naive_average'],
         ['plugin_eval-mus-p10_one_review', 'naive_average'],
     ],
-    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
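Both hunks in this file touch the summary_groups=sum([...]) idiom. It collects every module-level variable whose name ends in _summary_groups into a single flat list, so any group definitions imported via read_base are picked up automatically. A minimal runnable sketch of the idiom with invented group contents, not the actual OpenCompass config machinery:

# Minimal sketch of the *_summary_groups aggregation idiom used throughout
# these configs. The group contents here are invented for illustration.
mgsm_summary_groups = [{'name': 'mgsm_latin', 'subsets': ['mgsm_en', 'mgsm_de']}]
leval_summary_groups = [{'name': 'leval', 'subsets': ['LEval_coursera', 'LEval_nq']}]

# sum(list_of_lists, []) concatenates the lists; the k.endswith() filter picks
# up every variable that follows the *_summary_groups naming convention.
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])

summarizer = dict(summary_groups=summary_groups)
print([g['name'] for g in summarizer['summary_groups']])  # ['mgsm_latin', 'leval']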
configs/summarizers/contamination.py

...
@@ -60,63 +60,63 @@ ceval_category_weights = {
 }

 mmlu_category_weights = {
-    "business_ethics": {"accuracy - clean": 44, "accuracy - input contaminated": 16, "accuracy - input-and-label contaminated": 38, "accuracy - not labeled": 1},
-    "security_studies": {"accuracy - clean": 188, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 47, "accuracy - not labeled": 0},
-    "high_school_us_history": {"accuracy - clean": 42, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 161},
     ...
-    "professional_medicine": {"accuracy - clean": 191, "accuracy - input contaminated": 43, "accuracy - input-and-label contaminated": 1, "accuracy - not labeled": 36},
-    "anatomy": {"accuracy - clean": 52, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 76, "accuracy - not labeled": 0},
+    'business_ethics': {'accuracy - clean': 44, 'accuracy - input contaminated': 16, 'accuracy - input-and-label contaminated': 38, 'accuracy - not labeled': 1},
+    'security_studies': {'accuracy - clean': 188, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 47, 'accuracy - not labeled': 0},
+    'high_school_us_history': {'accuracy - clean': 42, 'accuracy - input contaminated': 0, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 161},
     ...
+    'professional_medicine': {'accuracy - clean': 191, 'accuracy - input contaminated': 43, 'accuracy - input-and-label contaminated': 1, 'accuracy - not labeled': 36},
+    'anatomy': {'accuracy - clean': 52, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 76, 'accuracy - not labeled': 0},
 }
...
@@ -166,7 +166,7 @@ for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accura
         'weights': weights,
     })
 for dataset_abbr, subsets in mmlu_name_and_subsets:
     weights = {f'lukaemon_mmlu_{i}': mmlu_category_weights[i][metric_name] for i in subsets}
     subsets = [[f'lukaemon_mmlu_{i}', metric_name] for i in subsets]
...
@@ -178,7 +178,7 @@ for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accura
         'weights': weights,
     })
 summary_groups.append({
     'name': 'hellaswag',
...
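mmlu_category_weights records, per MMLU subset, how many items fall into each contamination bucket ('clean', 'input contaminated', 'input-and-label contaminated', 'not labeled'). A small sketch of how such counts can serve as weights for a weighted average over subsets; the per-subset accuracies here are invented, and this illustrates the idea only, not the summarizer's actual implementation:

# Hypothetical per-subset accuracies for one contamination split ('accuracy - clean').
subset_accuracy = {'business_ethics': 62.0, 'security_studies': 71.5}

# Item counts per subset for that split, in the same shape as mmlu_category_weights.
weights = {'business_ethics': 44, 'security_studies': 188}

# Weighted mean: subsets with more items in the split contribute proportionally more.
total = sum(weights.values())
weighted_accuracy = sum(subset_accuracy[s] * weights[s] for s in weights) / total
print(round(weighted_accuracy, 2))  # 69.7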
configs/summarizers/example.py

...
@@ -14,5 +14,5 @@ with read_base():
     from .groups.mgsm import mgsm_summary_groups

 summarizer = dict(
-    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
configs/summarizers/groups/MMLUArabic.py

 sub_categories = {
     'math': ['abstract_algebra', 'college_mathematics', 'elementary_mathematics', 'high_school_mathematics', 'high_school_statistics'],
     'health': ['anatomy', 'clinical_knowledge', 'college_medicine', 'human_aging', 'medical_genetics', 'nutrition', 'professional_medicine', 'virology'],
     'physics': ['astronomy', 'college_physics', 'conceptual_physics', 'high_school_physics'],
     'business': ['business_ethics', 'management', 'marketing'],
     'biology': ['college_biology', 'high_school_biology'],
     'chemistry': ['college_chemistry', 'high_school_chemistry'],
     'computer science': ['college_computer_science', 'computer_security', 'high_school_computer_science', 'machine_learning'],
     'economics': ['econometrics', 'high_school_macroeconomics', 'high_school_microeconomics'],
     'engineering': ['electrical_engineering'],
     'philosophy': ['formal_logic', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'world_religions'],
     'other': ['global_facts', 'miscellaneous', 'professional_accounting'],
     'history': ['high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'prehistory'],
     'geography': ['high_school_geography'],
     'politics': ['high_school_government_and_politics', 'public_relations', 'security_studies', 'us_foreign_policy'],
     'psychology': ['high_school_psychology', 'professional_psychology'],
     'culture': ['human_sexuality', 'sociology'],
     'law': ['international_law', 'jurisprudence', 'professional_law']
 }

 categories = {
-    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
-    "humanities": ["history", "philosophy", "law"],
-    "social_sciences": ["politics", "culture", "economics", "geography", "psychology"],
-    "other": ["other", "business", "health"],
+    'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
+    'humanities': ['history', 'philosophy', 'law'],
+    'social_sciences': ['politics', 'culture', 'economics', 'geography', 'psychology'],
+    'other': ['other', 'business', 'health'],
 }

 category2subject = {}
...
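category2subject is initialised to an empty dict just before the excerpt ends. A plausible reconstruction of how the mapping from a top-level category to its concrete subjects can be derived from the categories and sub_categories dicts above; the file's actual loop is elided on this page, so treat the construction as an assumption:

# Sketch: expand each top-level category into the union of its subjects,
# assuming `categories` maps category -> sub-category names and
# `sub_categories` maps sub-category -> subject lists (as defined above).
sub_categories = {'math': ['abstract_algebra', 'college_mathematics'],
                  'physics': ['astronomy', 'college_physics']}
categories = {'STEM': ['physics', 'math']}

category2subject = {}
for category, sub_cats in categories.items():
    for sub_cat in sub_cats:
        category2subject.setdefault(category, []).extend(sub_categories[sub_cat])

print(category2subject)
# {'STEM': ['astronomy', 'college_physics', 'abstract_algebra', 'college_mathematics']}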
configs/summarizers/groups/cibench.py

...
@@ -392,4 +392,4 @@ cibench_summary_groups.extend([
         'subsets': [i[:2] for i in cibench_math],
         'weights': {f'{k[0]}@{k[1]}': k[-1] for k in cibench_math},
     },
-])
\ No newline at end of file
+])
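The cibench group builds its 'weights' mapping by joining each entry's dataset abbreviation and metric with an '@' separator. A small sketch of the comprehension's effect on invented (dataset_abbr, metric, weight) entries; the real cibench_math entries are defined earlier in the file and are not shown here:

# Invented entries in the (dataset_abbr, metric, weight) shape assumed for cibench_math.
cibench_math = [('cibench_numpy', 'executable', 20), ('cibench_scipy', 'numeric_correct', 35)]

subsets = [i[:2] for i in cibench_math]                    # first two fields: (abbr, metric)
weights = {f'{k[0]}@{k[1]}': k[-1] for k in cibench_math}  # 'abbr@metric' -> weight

print(subsets)   # [('cibench_numpy', 'executable'), ('cibench_scipy', 'numeric_correct')]
print(weights)   # {'cibench_numpy@executable': 20, 'cibench_scipy@numeric_correct': 35}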
configs/summarizers/groups/cmmlu.py

 subcategories = {
-    "agronomy": ['other'],
-    "anatomy": ['biology'],
-    "ancient_chinese": ['linguistics', 'china specific'],
     ...
-    "world_history": ['history'],
-    "world_religions": ['global'],
+    'agronomy': ['other'],
+    'anatomy': ['biology'],
+    'ancient_chinese': ['linguistics', 'china specific'],
     ...
+    'world_history': ['history'],
+    'world_religions': ['global'],
 }

 categories = {
-    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
-    "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
-    "Social Science": ['linguistics', "business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
-    "Other": ["other"],
-    "China specific": ["china specific"],
+    'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering', 'statistics'],
+    'Humanities': ['history', 'philosophy', 'law', 'arts', 'literature', 'global'],
+    'Social Science': ['linguistics', 'business', 'politics', 'culture', 'economics', 'geography', 'psychology', 'education', 'sociology'],
+    'Other': ['other'],
+    'China specific': ['china specific'],
 }

 category2subject = {}
...
configs/summarizers/groups/lawbench.py

 names = [
-    ["1-1", "article_recitation"],
-    ["1-2", "knowledge_question_answering"],
-    ["2-1", "document_proofreading"],
-    ["2-2", "dispute_focus_identification"],
-    ["2-3", "marital_disputes_identification"],
-    ["2-4", "issue_topic_identification"],
-    ["2-5", "reading_comprehension"],
-    ["2-6", "named_entity_recognition"],
-    ["2-7", "opinion_summarization"],
-    ["2-8", "argument_mining"],
-    ["2-9", "event_detection"],
-    ["2-10", "trigger_word_extraction"],
-    ["3-1", "fact_based_article_prediction"],
-    ["3-2", "scene_based_article_prediction"],
-    ["3-3", "charge_prediction"],
-    ["3-4", "prison_term_prediction_wo_article"],
-    ["3-5", "prison_term_prediction_w_article"],
-    ["3-6", "case_analysis"],
-    ["3-7", "criminal_damages_calculation"],
-    ["3-8", "consultation"],
+    ['1-1', 'article_recitation'],
+    ['1-2', 'knowledge_question_answering'],
+    ['2-1', 'document_proofreading'],
+    ['2-2', 'dispute_focus_identification'],
+    ['2-3', 'marital_disputes_identification'],
+    ['2-4', 'issue_topic_identification'],
+    ['2-5', 'reading_comprehension'],
+    ['2-6', 'named_entity_recognition'],
+    ['2-7', 'opinion_summarization'],
+    ['2-8', 'argument_mining'],
+    ['2-9', 'event_detection'],
+    ['2-10', 'trigger_word_extraction'],
+    ['3-1', 'fact_based_article_prediction'],
+    ['3-2', 'scene_based_article_prediction'],
+    ['3-3', 'charge_prediction'],
+    ['3-4', 'prison_term_prediction_wo_article'],
+    ['3-5', 'prison_term_prediction_w_article'],
+    ['3-6', 'case_analysis'],
+    ['3-7', 'criminal_damages_calculation'],
+    ['3-8', 'consultation'],
 ]

 lawbench_summary_groups = []
...
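lawbench_summary_groups starts as an empty list and its population is elided here. A hedged sketch of how groups could be assembled from names for the 0-shot and 1-shot dataset abbreviations that appear later in configs/summarizers/lawbench.py (e.g. 'lawbench-3-8-consultation-1-shot'); the repository's actual loop may differ:

names = [['1-1', 'article_recitation'], ['3-8', 'consultation']]  # truncated for the sketch

lawbench_summary_groups = []
for shot in ['0-shot', '1-shot']:
    # One group per shot setting, averaging all per-task lawbench datasets for that setting.
    subsets = [f'lawbench-{index}-{task}-{shot}' for index, task in names]
    lawbench_summary_groups.append({'name': f'lawbench-{shot}', 'subsets': subsets})

print(lawbench_summary_groups[1]['subsets'])
# ['lawbench-1-1-article_recitation-1-shot', 'lawbench-3-8-consultation-1-shot']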
configs/summarizers/groups/leval.py

 leval_summary_groups = [
-    {"name": "leval", "subsets": ["LEval_coursera", "LEval_gsm100", "LEval_quality", "LEval_tpo", "LEval_topic_retrieval", "LEval_financialqa", "LEval_gov_report_summ", "LEval_legal_contract_qa", "LEval_meeting_summ", "LEval_multidocqa", "LEval_narrativeqa", "LEval_nq", "LEval_news_summ", "LEval_paper_assistant", "LEval_patent_summ", "LEval_review_summ", "LEval_scientificqa", "LEval_tvshow_summ"]},
+    {'name': 'leval', 'subsets': ['LEval_coursera', 'LEval_gsm100', 'LEval_quality', 'LEval_tpo', 'LEval_topic_retrieval', 'LEval_financialqa', 'LEval_gov_report_summ', 'LEval_legal_contract_qa', 'LEval_meeting_summ', 'LEval_multidocqa', 'LEval_narrativeqa', 'LEval_nq', 'LEval_news_summ', 'LEval_paper_assistant', 'LEval_patent_summ', 'LEval_review_summ', 'LEval_scientificqa', 'LEval_tvshow_summ']},
 ]
configs/summarizers/groups/lveval.py

-len_levels = ["16k", "32k", "64k", "128k", "256k"]
+len_levels = ['16k', '32k', '64k', '128k', '256k']

 subsets_lveval_loogle_SD_mixup = [
-    "LVEval_loogle_SD_mixup" + "_" + len_level for len_level in len_levels
+    'LVEval_loogle_SD_mixup' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_cmrc_mixup = [
-    "LVEval_cmrc_mixup" + "_" + len_level for len_level in len_levels
+    'LVEval_cmrc_mixup' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_multifieldqa_en_mixup = [
-    "LVEval_multifieldqa_en_mixup" + "_" + len_level
+    'LVEval_multifieldqa_en_mixup' + '_' + len_level
     for len_level in len_levels
 ]
 ...
 subsets_lveval_factrecall_zh = [
-    "LVEval_factrecall_zh" + "_" + len_level for len_level in len_levels
+    'LVEval_factrecall_zh' + '_' + len_level for len_level in len_levels
 ]
 subsets_lveval_single_hop_qa = (
...
@@ -64,47 +64,47 @@ subsets_lveval_qa = (
 lveval_summary_groups = [
     {
-        "name": "LVEval_loogle_SD_mixup",
-        "subsets": subsets_lveval_loogle_SD_mixup,
+        'name': 'LVEval_loogle_SD_mixup',
+        'subsets': subsets_lveval_loogle_SD_mixup,
     },
-    {"name": "LVEval_cmrc_mixup", "subsets": subsets_lveval_cmrc_mixup},
+    {'name': 'LVEval_cmrc_mixup', 'subsets': subsets_lveval_cmrc_mixup},
     {
-        "name": "LVEval_multifieldqa_en_mixup",
-        "subsets": subsets_lveval_multifieldqa_en_mixup,
+        'name': 'LVEval_multifieldqa_en_mixup',
+        'subsets': subsets_lveval_multifieldqa_en_mixup,
     },
     ...
-    {"name": "LVEval_multi_hop_qa", "subsets": subsets_lveval_multi_hop_qa},
-    {"name": "LVEval_multi_hop_cqa", "subsets": subsets_lveval_multi_hop_cqa},
+    {'name': 'LVEval_multi_hop_qa', 'subsets': subsets_lveval_multi_hop_qa},
+    {'name': 'LVEval_multi_hop_cqa', 'subsets': subsets_lveval_multi_hop_cqa},
     {
-        "name": "LVEval_factrecall_cqa",
-        "subsets": subsets_lveval_factrecall_cqa,
+        'name': 'LVEval_factrecall_cqa',
+        'subsets': subsets_lveval_factrecall_cqa,
     },
-    {"name": "LVEval_qa", "subsets": subsets_lveval_qa},
+    {'name': 'LVEval_qa', 'subsets': subsets_lveval_qa},
 ]
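Each subsets_lveval_* list expands one dataset abbreviation across every context-length level. The comprehension pattern repeats eleven times; a small illustrative helper (not part of the commit) that produces the same lists:

len_levels = ['16k', '32k', '64k', '128k', '256k']

def with_len_levels(base: str) -> list:
    # Expand a base abbreviation into one entry per context-length level,
    # mirroring the `base + '_' + len_level` comprehensions above.
    return [f'{base}_{len_level}' for len_level in len_levels]

subsets_lveval_loogle_SD_mixup = with_len_levels('LVEval_loogle_SD_mixup')
print(subsets_lveval_loogle_SD_mixup)
# ['LVEval_loogle_SD_mixup_16k', ..., 'LVEval_loogle_SD_mixup_256k']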
configs/summarizers/groups/mgsm.py

-ALL_LANGUAGES = ["bn", "de", "en", "es", "fr", "ja", "ru", "sw", "te", "th", "zh"]
-LATIN_LANGUAGES = ["de", "en", "es", "fr", "sw"]
-NON_LATIN_LANGUAGES = ["bn", "ja", "ru", "te", "th", "zh"]
+ALL_LANGUAGES = ['bn', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'sw', 'te', 'th', 'zh']
+LATIN_LANGUAGES = ['de', 'en', 'es', 'fr', 'sw']
+NON_LATIN_LANGUAGES = ['bn', 'ja', 'ru', 'te', 'th', 'zh']

 mgsm_summary_groups = [
     {'name': 'mgsm_latin', 'subsets': [f'mgsm_{lang}' for lang in LATIN_LANGUAGES]},
...
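The visible group builds per-language MGSM abbreviations from LATIN_LANGUAGES. A sketch of the whole group list under the assumption that the elided entries mirror the same pattern for the non-Latin and full language sets:

ALL_LANGUAGES = ['bn', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'sw', 'te', 'th', 'zh']
LATIN_LANGUAGES = ['de', 'en', 'es', 'fr', 'sw']
NON_LATIN_LANGUAGES = ['bn', 'ja', 'ru', 'te', 'th', 'zh']

# Assumed continuation of the group list shown above: one aggregate per language
# family plus an overall aggregate, each averaging its per-language mgsm_* datasets.
mgsm_summary_groups = [
    {'name': 'mgsm_latin', 'subsets': [f'mgsm_{lang}' for lang in LATIN_LANGUAGES]},
    {'name': 'mgsm_non_latin', 'subsets': [f'mgsm_{lang}' for lang in NON_LATIN_LANGUAGES]},
    {'name': 'mgsm', 'subsets': [f'mgsm_{lang}' for lang in ALL_LANGUAGES]},
]
print([g['name'] for g in mgsm_summary_groups])  # ['mgsm_latin', 'mgsm_non_latin', 'mgsm']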
configs/summarizers/groups/scibench.py

 scibench_summary_groups = []
-scibench_tasks = ["atkins", "calculus", "chemmc", "class", "diff", "fund", "matter", "quan", "stat", "thermo"]
-for suffix in ["", "_zs-cot", "_fs", "_fs-cot"]:
-    subsets = [f"scibench-{subset}{suffix}" for subset in scibench_tasks]
+scibench_tasks = ['atkins', 'calculus', 'chemmc', 'class', 'diff', 'fund', 'matter', 'quan', 'stat', 'thermo']
+for suffix in ['', '_zs-cot', '_fs', '_fs-cot']:
+    subsets = [f'scibench-{subset}{suffix}' for subset in scibench_tasks]
     scibench_summary_groups.append({'name': f'scibench{suffix}', 'subsets': subsets})
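Running the loop above yields one summary group per prompting style (zero-shot, zero-shot CoT, few-shot, few-shot CoT). A short demonstration of its output, using the file's own code verbatim:

scibench_summary_groups = []
scibench_tasks = ['atkins', 'calculus', 'chemmc', 'class', 'diff', 'fund', 'matter', 'quan', 'stat', 'thermo']
for suffix in ['', '_zs-cot', '_fs', '_fs-cot']:
    subsets = [f'scibench-{subset}{suffix}' for subset in scibench_tasks]
    scibench_summary_groups.append({'name': f'scibench{suffix}', 'subsets': subsets})

print([g['name'] for g in scibench_summary_groups])
# ['scibench', 'scibench_zs-cot', 'scibench_fs', 'scibench_fs-cot']
print(scibench_summary_groups[1]['subsets'][:2])
# ['scibench-atkins_zs-cot', 'scibench-calculus_zs-cot']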
configs/summarizers/groups/teval.py

...
@@ -71,4 +71,3 @@ for group in _base_summary_groups:
     group['name'] = group['name'] + '_zh'
     group['subsets'] = [[subset[0] + '_zh', subset[1]] for subset in group['subsets']]
     teval_summary_groups.append(group)
-
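This loop derives Chinese T-Eval groups by appending '_zh' to each base group's name and to every subset abbreviation. A self-contained sketch with an invented stand-in for _base_summary_groups (defined earlier in the file and not shown here); the deepcopy is added for the sketch so the base groups are left untouched, which is an assumption about the surrounding code:

import copy

# Invented stand-in for the _base_summary_groups defined earlier in the file.
_base_summary_groups = [
    {'name': 'teval-instruct', 'subsets': [['teval-instruct_v1', 'score']]},
]

teval_summary_groups = []
for group in copy.deepcopy(_base_summary_groups):
    # Suffix the group name and every subset abbreviation to address the Chinese split.
    group['name'] = group['name'] + '_zh'
    group['subsets'] = [[subset[0] + '_zh', subset[1]] for subset in group['subsets']]
    teval_summary_groups.append(group)

print(teval_summary_groups)
# [{'name': 'teval-instruct_zh', 'subsets': [['teval-instruct_v1_zh', 'score']]}]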
configs/summarizers/groups/xiezhi.py

 xiezhi_summary_groups = []

-_xiezhi = ["xiezhi-spec_eng", "xiezhi-spec_chn", "xiezhi-inter_eng", "xiezhi-inter_chn"]
+_xiezhi = ['xiezhi-spec_eng', 'xiezhi-spec_chn', 'xiezhi-inter_eng', 'xiezhi-inter_chn']
 xiezhi_summary_groups.append({'name': 'xiezhi', 'subsets': _xiezhi})
configs/summarizers/infinitebench.py

...
@@ -2,7 +2,7 @@ from mmengine.config import read_base
 with read_base():
     from .groups.infinitebench import infinitebench_summary_groups

 summarizer = dict(
-    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
configs/summarizers/internlm2_keyset.py

...
@@ -16,5 +16,5 @@ summarizer = dict(
         ['sanitized_mbpp', 'score'],
     ],
     summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
configs/summarizers/lawbench.py

...
@@ -50,7 +50,7 @@ summarizer = dict(
         'lawbench-3-7-criminal_damages_calculation-1-shot',
         'lawbench-3-8-consultation-1-shot',
     ],
-    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
     prompt_db=dict(
         database_path='configs/datasets/log.json',
         config_dir='configs/datasets',
...
configs/summarizers/leaderboard.py

...
@@ -13,11 +13,11 @@ with read_base():

 other_summary_groups = []
-other_summary_groups.append({'name': 'Exam', 'subsets': ["ceval", 'agieval', 'mmlu', 'cmmlu', "GaokaoBench", 'ARC-c', 'ARC-e']})
+other_summary_groups.append({'name': 'Exam', 'subsets': ['ceval', 'agieval', 'mmlu', 'cmmlu', 'GaokaoBench', 'ARC-c', 'ARC-e']})
 other_summary_groups.append({'name': 'Language', 'subsets': ['WiC', 'chid-dev', 'afqmc-dev', 'WSC', 'tydiqa-goldp', 'flores_100']})
 other_summary_groups.append({'name': 'Knowledge', 'subsets': ['BoolQ', 'commonsense_qa', 'triviaqa', 'nq']})
 other_summary_groups.append({'name': 'Understanding', 'subsets': ['C3', 'race-middle', 'race-high', 'openbookqa_fact', 'csl_dev', 'lcsts', 'Xsum', 'eprstmt-dev', 'lambada']})
-other_summary_groups.append({'name': 'Reasoning', 'subsets': ['cmnli', 'ocnli', 'AX_b', 'AX_g', 'RTE', 'COPA', 'ReCoRD', 'hellaswag', 'piqa', 'siqa', 'math', 'gsm8k', 'drop', 'openai_humaneval', 'mbpp', "bbh"]})
+other_summary_groups.append({'name': 'Reasoning', 'subsets': ['cmnli', 'ocnli', 'AX_b', 'AX_g', 'RTE', 'COPA', 'ReCoRD', 'hellaswag', 'piqa', 'siqa', 'math', 'gsm8k', 'drop', 'openai_humaneval', 'mbpp', 'bbh']})
 other_summary_groups.append({'name': 'Overall', 'subsets': ['Exam', 'Language', 'Knowledge', 'Understanding', 'Reasoning']})

 summarizer = dict(
...
@@ -30,11 +30,11 @@ summarizer = dict(
         'Reasoning',
         '--------- 考试 Exam ---------',  # category
         # 'Mixed', # subcategory
-        "ceval",
+        'ceval',
         'agieval',
         'mmlu',
         'cmmlu',
-        "GaokaoBench",
+        'GaokaoBench',
         'ARC-c',
         'ARC-e',
         '--------- 语言 Language ---------',  # category
...
@@ -92,8 +92,8 @@ summarizer = dict(
         'openai_humaneval',
         'mbpp',
         # '综合推理', # subcategory
-        "bbh",
+        'bbh',
     ],
     summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
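leaderboard.py nests groups: 'Overall' averages the category groups, which themselves average individual datasets. A toy sketch of that two-level averaging with invented scores; the resolution order here is an assumption for the sketch, not OpenCompass's actual summarizer logic:

# Invented leaf scores for a few datasets.
scores = {'ceval': 60.0, 'agieval': 40.0, 'WiC': 55.0, 'chid-dev': 65.0}

groups = [
    {'name': 'Exam', 'subsets': ['ceval', 'agieval']},
    {'name': 'Language', 'subsets': ['WiC', 'chid-dev']},
    {'name': 'Overall', 'subsets': ['Exam', 'Language']},
]

for group in groups:  # earlier groups are resolved before later ones reference them
    subset_scores = [scores[s] for s in group['subsets']]
    scores[group['name']] = sum(subset_scores) / len(subset_scores)

print(scores['Exam'], scores['Language'], scores['Overall'])  # 50.0 60.0 55.0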
configs/summarizers/lveval.py

...
@@ -5,110 +5,110 @@ with read_base():

 summarizer = dict(
     dataset_abbrs=[
-        "----------------------------------------",
-        "--------- LVEval All ---------",  # category
-        "----------------------------------------",
-        "LVEval_qa",
-        "----------------------------------------",
-        "--------- LVEval Tasks All ---------",  # category
-        "----------------------------------------",
-        "LVEval_single_hop_qa",
-        "LVEval_single_hop_cqa",
-        "LVEval_multi_hop_qa",
-        "LVEval_multi_hop_cqa",
-        "LVEval_factrecall_cqa",
         ...
-        "LVEval_factrecall_zh_128k",
-        "LVEval_factrecall_zh_256k",
+        '----------------------------------------',
+        '--------- LVEval All ---------',  # category
+        '----------------------------------------',
+        'LVEval_qa',
+        '----------------------------------------',
+        '--------- LVEval Tasks All ---------',  # category
+        '----------------------------------------',
+        'LVEval_single_hop_qa',
+        'LVEval_single_hop_cqa',
+        'LVEval_multi_hop_qa',
+        'LVEval_multi_hop_cqa',
+        'LVEval_factrecall_cqa',
         ...
+        'LVEval_factrecall_zh_128k',
+        'LVEval_factrecall_zh_256k',
     ],
     summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith("_summary_groups")], []
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []
     ),
 )
configs/summarizers/math_agent.py

...
@@ -21,5 +21,5 @@ summarizer = dict(
         'mathbench-circular-and-cloze-agent',
     ],
     summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
 )
configs/summarizers/math_baseline.py

...
@@ -15,5 +15,5 @@ summarizer = dict(
         'mathbench-circular-and-cloze',
     ],
     summary_groups=sum(
-        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
 )