OpenDAS / opencompass · Commits

Commit 16f29b25 (Unverified)
Authored Apr 07, 2024 by Mo Li; committed by GitHub on Apr 07, 2024
Parent: f2af4933

[Fix] Simplify needlebench summarizer (#1024)

* Conflicts: configs/summarizers/needlebench.py
* fix lint problems
Changes: 4 changed files with 505 additions and 865 deletions (+505 -865)
configs/datasets/needlebench/atc/atc_choice_50.py   +43  -0
configs/datasets/needlebench/atc/atc_choice_80.py   +43  -0
configs/summarizers/needlebench.py                  +188 -692
opencompass/summarizers/needlebench.py              +231 -173
configs/datasets/needlebench/atc/atc_choice_50.py (new file, mode 100644)
from mmengine.config import read_base

with read_base():
    from .atc_choice_20 import *

needle_num_list = list(range(2, 50, 1))
needlebench_datasets = []

for _name in list(single_choice_prompts.keys()):
    needlebench_atc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=(single_choice_prompts[_name])),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer,),
    )

    needlebench_atc_eval_cfg = dict(
        evaluator=dict(type=CircularEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    for num_needles in needle_num_list:
        abbr = (f'NeedleBenchATCDataset-'
                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
        language = "English" if "en" in _name else "Chinese"
        if 'reasoning' in _name:
            abbr += '-Reasoning'

        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
            'path': names_path,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
            'eval_cfg': needlebench_atc_eval_cfg
        }
        needlebench_datasets.append(dataset_dict)
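The naming rule above is easy to check in isolation. A minimal sketch follows; the two prompt keys are hypothetical stand-ins, since the real keys are imported from atc_choice_20.py:

single_choice_prompts = {'single_choice_en_reasoning': None,  # hypothetical keys
                         'single_choice_zh': None}

for _name in single_choice_prompts:
    for num_needles in (2, 49):
        abbr = (f'NeedleBenchATCDataset-'
                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
        if 'reasoning' in _name:
            abbr += '-Reasoning'
        print(abbr)
# NeedleBenchATCDataset-2Needle-EN-Reasoning
# NeedleBenchATCDataset-49Needle-EN-Reasoning
# NeedleBenchATCDataset-2Needle-ZH
# NeedleBenchATCDataset-49Needle-ZH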
configs/datasets/needlebench/atc/atc_choice_80.py (new file, mode 100644)
from mmengine.config import read_base

with read_base():
    from .atc_choice_20 import *

needle_num_list = list(range(2, 80, 1))
needlebench_datasets = []

# (The remainder of this file is identical to atc_choice_50.py above: the same
# infer/eval configs and the same dataset_dict loop over needle_num_list.)
configs/summarizers/needlebench.py
from opencompass.summarizers.needlebench import NeedleBenchSummarizer
from opencompass.summarizers.needlebench import NeedleBenchATCSummarizer  # added in this commit
# ----------NeedleBench-4k-summarizer----------
context_lengths_4k = list(range(1000, 5000, 1000))
depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57,
          63, 68, 73, 78, 84, 89, 94, 100]
depths_list_sparse = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
Removed (the old hand-written 4k block; the 8k, 32k, 128k, 200k and 1000k
sections below repeated exactly this pattern with the size token and
context-length list substituted, and were removed the same way):

# Initialize the lists
_needlebench_4k_2needle_en = []
_needlebench_4k_3needle_en = []
_needlebench_4k_4needle_en = []
_needlebench_4k_5needle_en = []
_needlebench_4k_2needle_zh = []
_needlebench_4k_3needle_zh = []
_needlebench_4k_4needle_zh = []
_needlebench_4k_5needle_zh = []
_needlebench_4k_origin_en = []
_needlebench_4k_origin_zh = []

# Fill the lists using nested loops
for original_context_length in context_lengths_4k:
    for depth_percent in depths:
        _needlebench_4k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_4k')
        _needlebench_4k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_4k')
        _needlebench_4k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_4k')
        _needlebench_4k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_4k')
        _needlebench_4k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_4k')
        _needlebench_4k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_4k')
        _needlebench_4k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_4k')
        _needlebench_4k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_4k')
        _needlebench_4k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_4k')
        _needlebench_4k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_4k')

# Concatenate the multi-needle and origin lists
_needlebench_4k_multi_needle_en = _needlebench_4k_2needle_en + _needlebench_4k_3needle_en + _needlebench_4k_4needle_en + _needlebench_4k_5needle_en
_needlebench_4k_multi_needle_zh = _needlebench_4k_2needle_zh + _needlebench_4k_3needle_zh + _needlebench_4k_4needle_zh + _needlebench_4k_5needle_zh
_needlebench_4k_origin = _needlebench_4k_origin_en + _needlebench_4k_origin_zh
_needlebench_4k_multi_needle = _needlebench_4k_multi_needle_en + _needlebench_4k_multi_needle_zh

# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_4k_parallel_en = []
_needlebench_4k_parallel_zh = []
for original_context_length in context_lengths_4k:
    _needlebench_4k_parallel_en.append(f'Length{original_context_length}_parallel_en_4k')
for original_context_length in context_lengths_4k:
    _needlebench_4k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_4k')
_needlebench_4k_parallel = _needlebench_4k_parallel_en + _needlebench_4k_parallel_zh

needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_4k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_4k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_4k_origin_en},
    {'name': 'multi_needle_en', 'subsets': _needlebench_4k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_4k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_4k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_4k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_4k_5needle_en},
    {'name': 'multi_needle_zh', 'subsets': _needlebench_4k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_4k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_4k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_4k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_4k_5needle_zh},
    {'name': 'multi_needle', 'subsets': _needlebench_4k_multi_needle},
    {'name': 'parallel_version', 'subsets': _needlebench_4k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_4k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_4k_parallel_en},
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]

needlebench_4k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-4k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-4k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-4k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',
        # *_needlebench_4k_origin, *_needlebench_4k_multi_needle, *_needlebench_4k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)

Added (two helper functions that generate the same groups for any size):

def create_m_rs_names_list(context_lengths, depths, needle_counts,
                           languages, dataset_size):
    names_dict = {}
    multi_needle_list = []
    multi_needle_en_list = []
    multi_needle_zh_list = []

    for needle_count in needle_counts:
        for language in languages:
            key = f"{needle_count}-Needle-{language.upper()}-{dataset_size.upper()}"
            names_list = [
                f"Length{length}Depth{int(depth)}_{needle_count}needle_{language}_{dataset_size}"
                for length in context_lengths
                for depth in depths
            ]
            names_dict[key] = names_list

            multi_needle_list.extend(names_list)
            if language == 'en':
                multi_needle_en_list.extend(names_list)
            elif language == 'zh':
                multi_needle_zh_list.extend(names_list)
    names_dict['Multi-Needle-Reasoning(M-RS)'] = multi_needle_list
    names_dict['Multi-Needle-Reasoning-EN'] = multi_needle_en_list
    names_dict['Multi-Needle-Reasoning-ZH'] = multi_needle_zh_list

    return names_dict


def create_summarizer(context_lengths, depths, dataset_size,
                      sparse_depths=None):
    needle_counts = ["2", "3", "4", "5"]
    languages = ["en", "zh"]
    if sparse_depths:
        depths = sparse_depths
    names_dict = {}
    multi_reasoning_names = create_m_rs_names_list(context_lengths, depths,
                                                   needle_counts, languages,
                                                   dataset_size)
    names_dict.update(multi_reasoning_names)

    single_needle_list = []
    single_needle_en_list = []
    single_needle_zh_list = []

    for language in languages:
        names_list = [
            f"Length{length}Depth{int(depth)}_origin_{language}_{dataset_size}"
            for length in context_lengths
            for depth in depths
        ]
        single_needle_list.extend(names_list)
        if language == 'en':
            single_needle_en_list.extend(names_list)
        elif language == 'zh':
            single_needle_zh_list.extend(names_list)
    names_dict['Single-Needle-Retrieval(S-RT)'] = single_needle_list
    names_dict['Single-Needle-Retrieval-EN'] = single_needle_en_list
    names_dict['Single-Needle-Retrieval-ZH'] = single_needle_zh_list

    parallel_list = []
    parallel_en_list = []
    parallel_zh_list = []

    for language in languages:
        names_list = [
            f"Length{length}_parallel_{language}_{dataset_size}"
            for length in context_lengths
        ]
        parallel_list.extend(names_list)
        if language == 'en':
            parallel_en_list.extend(names_list)
        elif language == 'zh':
            parallel_zh_list.extend(names_list)
    names_dict['Multi-Needle-Retrieval(M-RT)'] = parallel_list
    names_dict['Multi-Needle-Retrieval-EN'] = parallel_en_list
    names_dict['Multi-Needle-Retrieval-ZH'] = parallel_zh_list

    summary_groups = [{
        'name': key,
        'subsets': value
    } for key, value in names_dict.items()]

    summary_groups.append({
        'name': 'NeedleBench-Overall-Score',
        'subsets': [['Single-Needle-Retrieval(S-RT)', 'naive_average'],
                    ['Multi-Needle-Reasoning(M-RS)', 'naive_average'],
                    ['Multi-Needle-Retrieval(M-RT)', 'average_score']],
        'weights': {'Single-Needle-Retrieval(S-RT)': 0.4,
                    'Multi-Needle-Reasoning(M-RS)': 0.3,
                    'Multi-Needle-Retrieval(M-RT)': 0.3}})
    summarizer_config = {
        'type': NeedleBenchSummarizer,
        'summary_groups': summary_groups,
        'dataset_abbrs': [
            'NeedleBench-Overall-Score',
            f'--------- NeedleBench-{dataset_size.upper()}-Single-Needle-Retrieval ---------',
            'Single-Needle-Retrieval(S-RT)',
            'Single-Needle-Retrieval-EN',
            'Single-Needle-Retrieval-ZH',
            f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Retrieval ---------',
            'Multi-Needle-Retrieval(M-RT)',
            'Multi-Needle-Retrieval-EN',
            'Multi-Needle-Retrieval-ZH',
            f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Reasoning ---------',
            'Multi-Needle-Reasoning(M-RS)',
            'Multi-Needle-Reasoning-EN',
            'Multi-Needle-Reasoning-ZH',
            '2-Needle-EN-4K',
            '2-Needle-ZH-4K',
            '3-Needle-EN-4K',
            '3-Needle-ZH-4K',
            '4-Needle-EN-4K',
            '4-Needle-ZH-4K',
            '5-Needle-EN-4K',
            '5-Needle-ZH-4K',
        ]
    }
    return summarizer_config
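Called directly, the helper yields the same dataset names the removed block spelled out by hand. A small sketch of the name scheme it generates (the depth list is trimmed for the demo; the real one has 20 entries):

context_lengths_4k = list(range(1000, 5000, 1000))
demo_depths = [0, 5, 10]
origin_names = [f"Length{length}Depth{int(depth)}_origin_en_4k"
                for length in context_lengths_4k for depth in demo_depths]
print(origin_names[:3])
# ['Length1000Depth0_origin_en_4k', 'Length1000Depth5_origin_en_4k',
#  'Length1000Depth10_origin_en_4k']
print(f"{'2'}-Needle-{'en'.upper()}-{'4k'.upper()}")
# 2-Needle-EN-4K  (the group key create_m_rs_names_list registers)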
Added (the new per-size configs; the removed lines in each of these sections
were the 4k boilerplate shown above, with only the size token and the
context-length list changed):

needlebench_4k_summarizer = create_summarizer(context_lengths_4k, depths, "4k")

# ----------NeedleBench-8k-summarizer----------
context_lengths_8k = list(range(5000, 9000, 1000))
needlebench_8k_summarizer = create_summarizer(context_lengths_8k, depths, "8k")

# ----------NeedleBench-32k-summarizer----------
context_lengths_32k = [9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]
needlebench_32k_summarizer = create_summarizer(context_lengths_32k,
                                               depths_list_sparse, "32k")

# ----------NeedleBench-128k-summarizer----------
context_lengths_128k = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
needlebench_128k_summarizer = create_summarizer(context_lengths_128k,
                                                depths_list_sparse, "128k")

# ----------NeedleBench-200k-summarizer----------
context_lengths_200k = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000])
needlebench_200k_summarizer = create_summarizer(context_lengths_200k,
                                                depths_list_sparse, "200k")

# ----------NeedleBench-1000k-summarizer----------
context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k,
                                                 depths_list_sparse, "1000k")
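Every generated config aggregates with the same 0.4/0.3/0.3 weighting. A plain-Python sketch of the 'NeedleBench-Overall-Score' group (the group averages are hypothetical, and it assumes 'weighted_average' normalizes by the weight sum):

group_scores = {'Single-Needle-Retrieval(S-RT)': 82.0,   # hypothetical averages
                'Multi-Needle-Reasoning(M-RS)': 64.5,
                'Multi-Needle-Retrieval(M-RT)': 71.0}
weights = {'Single-Needle-Retrieval(S-RT)': 0.4,
           'Multi-Needle-Reasoning(M-RS)': 0.3,
           'Multi-Needle-Retrieval(M-RT)': 0.3}
overall = (sum(group_scores[k] * weights[k] for k in weights)
           / sum(weights.values()))
print(f'{overall:.2f}')  # 73.45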
Unchanged context around the 8k batch summarizers (collapsed in the diff view):

context_lengths_8k = list(range(5000, 9000, 1000))
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_8k_parallel_en_batch1 = []
_needlebench_8k_parallel_en_batch5 = []
_needlebench_8k_parallel_en_batch10 = []
...
@@ -713,7 +202,6 @@ needlebench_8k_batch_overall_summarizer = dict(
        'parallel_version_en_batch15',
        'parallel_version_zh_batch20',
        'parallel_version_en_batch20',
        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
...
@@ -754,64 +242,72 @@ needlebench_8k_batch_depth0_summarizer = dict(
        'parallel_version_en_batch15',
        'parallel_version_zh_batch20',
        'parallel_version_en_batch20',
        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
Removed (the old ATC summarizer, hard-wired to 2-19 needles; comments translated from Chinese):

needle_num_list = list(range(2, 20, 1))

categories = [
    'ZH', 'EN', 'ZH-Reasoning', 'EN-Reasoning',
    'ZH-CircularEval', 'EN-CircularEval',
    'ZH-Reasoning-Circular', 'EN-Reasoning-Circular']
needlebench_atc_summary_groups = []
for category in categories:
    metric = 'perf_4' if 'CircularEval' in category else 'acc_1'
    cleaned_category = category.replace('-CircularEval', '').replace('-Circular', '')
    # Generate summary groups by category
    subsets = [
        [f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}', metric]
        for num_needles in needle_num_list
    ]
    needlebench_atc_summary_groups.append({
        'name': category,
        'subsets': subsets,
    })

atc_dataset_abbrs = []
for category in categories:
    weighted_average_score_entry = [f'{category}', 'weighted_average']
    atc_dataset_abbrs.append(weighted_average_score_entry)
    title = f'######## Needlebench-ATC-{category}-Score ########'
    atc_dataset_abbrs.append(title)
if atc_dataset_abbrs[-1] == '------------------------------------------':
    atc_dataset_abbrs.pop()

needlebench_atc_summarizer = dict(
    dataset_abbrs=[
        *atc_dataset_abbrs,
        '######## Needlebench-ATC Accuracy ########',  # category
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'acc_1']
          for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'acc_1']
          for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'acc_1']
          for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'acc_1']
          for num_needles in needle_num_list],
        '######## Needlebench-ATC CircularEval ########',  # category
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'perf_4']
          for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'perf_4']
          for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'perf_4']
          for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'perf_4']
          for num_needles in needle_num_list],
    ],
    summary_groups=needlebench_atc_summary_groups,
)

Added (a parameterized generator, so 50- and 80-needle variants reuse it;
comments translated from Chinese):

def gen_atc_summarizer(needle_num_list):
    categories = [
        'ZH-Direct-CE', 'EN-Direct-CE',
        'ZH-Reasoning-CE', 'EN-Reasoning-CE']
    needlebench_atc_summary_groups = []
    for category in categories:
        # CircularEval (CE) scores use the perf_4 metric; otherwise use acc_1
        metric = 'perf_4' if 'CE' in category else 'acc_1'
        # dataset names in subsets do not carry the CircularEval suffix
        cleaned_category = category.replace('-CE', '').replace('-Direct', '')
        needlebench_atc_summary_groups.append({
            'name': category,
            'subsets': [[
                f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}',
                metric
            ] for num_needles in needle_num_list],
            'weights': {
                f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}':
                num_needles
                for num_needles in needle_num_list
            },
        })
    needlebench_atc_summary_groups.append({
        'name': 'ATC-CE-Overall',
        'subsets': [[f'{category}', 'weighted_average']
                    for category in categories],
    })
    atc_dataset_abbrs = []
    atc_dataset_abbrs.append(['ATC-CE-Overall', 'naive_average'])
    for category in categories:
        weighted_average_score_entry = [f'{category}', 'weighted_average']
        atc_dataset_abbrs.append(weighted_average_score_entry)

    needlebench_atc_summarizer = dict(
        dataset_abbrs=[
            *atc_dataset_abbrs,
        ],
        summary_groups=needlebench_atc_summary_groups)
    return needlebench_atc_summarizer


atc_summarizer_20 = gen_atc_summarizer(list(range(2, 20, 1)))
atc_summarizer_50 = gen_atc_summarizer(list(range(2, 50, 1)))
atc_summarizer_80 = gen_atc_summarizer(list(range(2, 80, 1)))
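gen_atc_summarizer weights each ATC dataset by its needle count, so the hardest (most-needle) items dominate the category score. A back-of-the-envelope version of that weighted average (toy scores; assumes 'weighted_average' divides by the weight sum):

needle_num_list = [2, 3, 4]                    # the real lists run to 20/50/80
perf_4_scores = {2: 90.0, 3: 75.0, 4: 60.0}    # hypothetical per-dataset scores
weighted = (sum(n * perf_4_scores[n] for n in needle_num_list)
            / sum(needle_num_list))
print(f'{weighted:.2f}')  # 71.67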
opencompass/summarizers/needlebench.py
...
@@ -5,6 +5,7 @@ import getpass
import math
import os
import os.path as osp
import shutil  # added in this commit
from datetime import datetime
from typing import Any, Dict, List, Optional
...
@@ -26,6 +27,92 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
                               model_abbr_from_cfg)
from opencompass.utils.prompt import get_prompt_hash
Added:

model_name_mapping = {
    'llama-2-7b-chat-hf': 'LLaMA-2-7B',
    'llama-2-13b-chat-hf': 'LLaMA-2-13B',
    'llama-2-70b-chat-hf': 'LLaMA-2-70B',
    'baichuan2-7b-chat-hf': 'Baichuan2-7B',
    'baichuan2-13b-chat-hf': 'Baichuan2-13B',
    'yi-6b-chat-hf': 'Yi-6B',
    'yi-34b-chat-hf': 'Yi-34B',
    'deepseek-67b-chat-hf': 'DeepSeek-67B',
    'wizardlm-70b-v1.0-vllm': 'WizardLM-70B',
    'qwen-14b-chat-hf': 'Qwen-14B',
    'qwen-72b-chat-hf': 'Qwen-72B',
    'qwen-72b-chat-vllm': 'Qwen-72B-vLLM',
    'internlm2-chat-7b-turbomind': 'InternLM2-7B-200K',
    'internlm2-chat-20b-turbomind': 'InternLM2-20B-200K',
    'internlm2-chat-7b-hf': 'InternLM2-7B',
    'internlm2-chat-20b-hf': 'InternLM2-20B',
    'qwen-7b-chat-hf': 'Qwen-7B',
    'chatglm3-6b-hf': 'ChatGLM3-6B',
    'chatglm3-6b-32k-hf': 'ChatGLM3-6B-32K',
    'zephyr-7b-beta-vllm': 'Zephyr-7B Beta',
    'mistral-7b-instruct-v0.2-vllm': 'Mistral-7B Inst. v0.2',
    'mistral-7b-instruct-v0.1-vllm': 'Mistral-7B Inst. v0.1',
    'mixtral-8x7b-instruct-v0.1-vllm': 'Mixtral-8x7B Inst. v0.1',
    'orionstar-yi-34b-chat-hf': 'OrionStar-Yi-34B',
    'orionstar-14b-long-chat-vllm': 'Orion-14B-LongChat',
    'internlm-chat-7b-hf': 'InternLM-7B',
    'gemma-2b-it-hf': 'Gemma-2B',
    'gemma-7b-it-hf': 'Gemma-7B',
    'qwen1.5-0.5b-chat-hf': 'Qwen-1.5-0.5B',
    'qwen1.5-1.8b-chat-hf': 'Qwen-1.5-1.8B',
    'qwen1.5-4b-chat-hf': 'Qwen-1.5-4B',
    'qwen1.5-14b-chat-hf': 'Qwen-1.5-14B',
    'qwen1.5-72b-chat-hf': 'Qwen-1.5-72B',
    'qwen1.5-14b-chat-vllm': 'Qwen-1.5-14B-vLLM',
    'qwen1.5-72b-chat-vllm': 'Qwen-1.5-72B-vLLM',
    'glm4_notools': 'GLM-4',
    'claude-3-opus': 'Claude-3-Opus',
    # Add more mappings as necessary
}

dataset_mapping_dict = {}

needle_counts = ['2', '3', '4', '5']
languages = ['en', 'zh']
sizes = ['4k', '8k', '32k', '200k', '1000k']
types = ['origin', 'parallel']

for needle_count in needle_counts:
    for language in languages:
        for size in sizes:
            key = f'{needle_count}needle_{language}_{size}'
            value = f'{needle_count}-Needle-Reasoning-{language.upper()}-{size.upper()}'
            dataset_mapping_dict[key] = value

for t in types:
    for language in languages:
        for size in sizes:
            if t == 'origin':
                key = f'{t}_{language}_{size}'
                value = f'Single-Needle-Retrieval-{language.upper()}-{size.upper()}'
            elif t == 'parallel':
                key = f'{t}_{language}_{size}'
                value = f'Multi-Needle-Retrieval-{language.upper()}-{size.upper()}'
            dataset_mapping_dict[key] = value
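The mapping converts a raw dataset abbreviation into the display/folder name used for the plots; for example, with the keys built by the loops above:

print(dataset_mapping_dict['2needle_en_4k'])   # 2-Needle-Reasoning-EN-4K
print(dataset_mapping_dict['origin_zh_200k'])  # Single-Needle-Retrieval-ZH-200K
print(dataset_mapping_dict['parallel_en_8k'])  # Multi-Needle-Retrieval-EN-8K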
def calculate_elementwise_average(model_name, merged_df):
    score_columns = [col for col in merged_df.columns if col != 'dataset']

    origin_columns = [col for col in score_columns if 'origin' in col]
    parallel_columns = [col for col in score_columns if 'parallel' in col]
    multi_columns = [col for col in score_columns if 'needle' in col]

    if origin_columns and parallel_columns and multi_columns:
        origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4
        parallel_avg = merged_df[parallel_columns].mean(axis=1) * 0.3
        multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3
        merged_df[model_name] = origin_avg + parallel_avg + multi_avg
    else:
        relevant_columns = origin_columns or parallel_columns or multi_columns
        if relevant_columns:
            merged_df[model_name] = merged_df[relevant_columns].mean(axis=1)
        else:
            merged_df[model_name] = pd.Series([0] * len(merged_df))

    return merged_df.iloc[:, [0, -1]]
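calculate_elementwise_average buckets columns by the substrings 'origin', 'parallel' and 'needle' and applies the same 0.4/0.3/0.3 weights row-wise. A toy check (the numbers are made up; real frames come from merge_dataframes):

import pandas as pd

merged_df = pd.DataFrame({
    'dataset': ['Length1000Depth0', 'Length1000Depth50'],
    'origin_en_4k': [80.0, 60.0],
    'parallel_en_4k': [70.0, 50.0],
    '2needle_en_4k': [90.0, 30.0],
})
out = calculate_elementwise_average('my-model', merged_df)
# my-model column: 80*0.4 + 70*0.3 + 90*0.3 = 80.0
#                  60*0.4 + 50*0.3 + 30*0.3 = 48.0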
Unchanged context, plus one small addition between existing helpers:

def read_after_specific_line_except_last(file_name, keyword, offset):
    with open(file_name, 'r', encoding='utf-8') as file:
...
@@ -65,6 +152,12 @@ def create_model_dataframe(nested_dict, model_name, dataset_abbr, parallel=False
    df = pd.DataFrame(data, columns=['dataset', model_name])
    return df


def convert_to_k(value):  # added: pretty-print context lengths as '4k', '32k', ...
    try:
        return f'{int(value) // 1000}k'
    except ValueError:
        return value


def parse_model_scores(text):
    lines = text.split('\n')
...
@@ -82,8 +175,86 @@ def parse_model_scores(text):
    return result_dict
def
remove_empty_subfolders
(
plot_path
):
for
folder_name
in
tqdm
(
os
.
listdir
(
plot_path
),
desc
=
'Deleting Empty folders'
):
folder_path
=
os
.
path
.
join
(
plot_path
,
folder_name
)
if
os
.
path
.
isdir
(
folder_path
):
if
not
os
.
listdir
(
folder_path
):
shutil
.
rmtree
(
folder_path
)
def save_results_to_plots(txt_results_save_path):

    content = read_after_specific_line_except_last(
        txt_results_save_path, 'raw format', 2)
    parsed_data = parse_model_scores(content)
    model_names = get_dict_model_names(parsed_data)

    numbers = [2, 3, 4, 5]
    languages = ['en', 'zh']
    size_exists = []
    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k']

    for size in sizes_origin:
        if size in content:
            size_exists.append(size)

    multi_dataset_abbrs = [
        f'{num}needle_{lang}{size}' for num in numbers
        for lang in languages for size in size_exists
    ]
    origin_dataset_abbrs = [
        f'origin_{lang}{size}' for lang in languages
        for size in size_exists
    ]
    parallel_dataset_abbrs = [
        f'parallel_{lang}{size}' for lang in languages
        for size in size_exists
    ]

    dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + \
        parallel_dataset_abbrs
    base_path = os.path.dirname(txt_results_save_path)
    plot_path = os.path.join(base_path, 'plots')
    model_scores = {}

    for model_name in tqdm(model_names):
        model_datasets_scores = {}  # Dictionary to store scores for each dataset for the current model
        for dataset_abbr in dataset_abbrs:
            parallel_flag = 'parallel' in dataset_abbr
            folder_path = os.path.join(plot_path,
                                       dataset_mapping_dict[dataset_abbr])
            ensure_directory(folder_path)
            save_path = os.path.join(folder_path, f'{model_name}.png')
            df = create_model_dataframe(parsed_data, model_name,
                                        dataset_abbr, parallel=parallel_flag)
            score = visualize(df, save_path, model_name, dataset_abbr)
            model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score)

        overall_dataset_abbrs = multi_dataset_abbrs + \
            origin_dataset_abbrs + parallel_dataset_abbrs
        overall_score_pic_path = os.path.join(plot_path,
                                              f'{model_name}_overall.png')
        merged_df = merge_dataframes(model_name, overall_dataset_abbrs,
                                     parsed_data)
        averaged_df = calculate_elementwise_average(model_name, merged_df)
        overall_score = visualize(averaged_df, overall_score_pic_path,
                                  model_name, 'Overall Score')

        # Single-Retrieval
        single_retrieval_score_pic_path = os.path.join(
            plot_path, f'{model_name}_single_retrieval_overall.png')
        single_retrieval_merged_df = merge_dataframes(
            model_name, origin_dataset_abbrs, parsed_data)
        single_retrieval_averaged_df = calculate_elementwise_average(
            model_name, single_retrieval_merged_df)
        single_retrieval_overall_score = visualize(
            single_retrieval_averaged_df, single_retrieval_score_pic_path,
            model_name, 'Single-Retrieval Overall Score')

        # Multi-Retrieval
        multi_retrieval_score_pic_path = os.path.join(
            plot_path, f'{model_name}_multi_retrieval_overall.png')
        multi_retrieval_merged_df = merge_dataframes(
            model_name, parallel_dataset_abbrs, parsed_data)
        multi_retrieval_averaged_df = calculate_elementwise_average(
            model_name, multi_retrieval_merged_df)
        multi_retrieval_overall_score = visualize(
            multi_retrieval_averaged_df, multi_retrieval_score_pic_path,
            model_name, 'Multi-Retrieval Overall Score')

        # Multi-Reasoning
        multi_reasoning_score_pic_path = os.path.join(
            plot_path, f'{model_name}_multi_reasoning_overall.png')
        multi_reasoning_merged_df = merge_dataframes(
            model_name, multi_dataset_abbrs, parsed_data)
        multi_reasoning_averaged_df = calculate_elementwise_average(
            model_name, multi_reasoning_merged_df)
        multi_reasoning_overall_score = visualize(
            multi_reasoning_averaged_df, multi_reasoning_score_pic_path,
            model_name, 'Multi-Reasoning Overall Score')

        model_scores[model_name] = averaged_df

    remove_empty_subfolders(plot_path)
    return model_scores
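A usage sketch for this entry point; the results path below is hypothetical and would normally be the raw-format summary file written by the summarizer:

# Hypothetical path to a summarizer output file.
txt_results_save_path = 'outputs/needlebench/summary/summary_20240407.txt'
scores = save_results_to_plots(txt_results_save_path)
# Heatmaps land under .../summary/plots/<dataset>/<model>.png, and
# scores maps each model name to its element-wise averaged DataFrame.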
def visualize(df_raw, save_path: str, model_name: str, dataset_type: str):
    df = df_raw.copy()
    if df.empty:
        return -1
    df['Context Length'] = df['dataset'].apply(
        lambda x: int(x.split('Length')[1].split('Depth')[0]))
    df['Document Depth'] = df['dataset'].apply(...)
...
@@ -98,144 +269,96 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
    model_df = df[['Document Depth', 'Context Length',
                   model_name]].copy()
    model_df.rename(columns={model_name: 'Score'}, inplace=True)

    # Create pivot table
    pivot_table = pd.pivot_table(model_df,
                                 values='Score',
                                 index=['Document Depth'],
                                 columns=['Context Length'],
                                 aggfunc='mean')
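    # For illustration only (not in the diff): the pivot turns the long-form
    # per-sample scores into a depth x length grid, e.g.
    #
    #   Context Length    1000   4000   8000
    #   Document Depth
    #   0.0              100.0   75.0   50.0
    #   50.0              90.0   60.0   45.0
    #
    # Rows are insertion depths, columns are context lengths, and each cell
    # is the mean score for that (depth, length) bucket.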
    # Calculate mean scores
    mean_scores = pivot_table.mean().values
    # Calculate overall score
    overall_score = mean_scores.mean()

    # Create heatmap and line plot
    plt.figure(figsize=(15.5, 8))
    ax = plt.gca()
    cmap = LinearSegmentedColormap.from_list(
        'custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])

    # Draw heatmap
    sns.heatmap(pivot_table,
                cmap=cmap,
                ax=ax,
                cbar_kws={'label': 'Score'},
                vmin=0,
                vmax=100)
    cbar = ax.collections[0].colorbar

    # Set line plot data
    x_data = [i + 0.5 for i in range(len(mean_scores))]
    y_data = mean_scores

    # Create twin axis for line plot
    ax2 = ax.twinx()
    # Draw line plot
    ax2.plot(x_data,
             y_data,
             color='white',
             marker='o',
             linestyle='-',
             linewidth=2,
             markersize=8,
             label='Average Depth Score')
    # Set y-axis range
    ax2.set_ylim(0, 100)

    # Hide original y-axis ticks and labels
    ax2.set_yticklabels([])
    ax2.set_yticks([])

    # Add legend
    ax2.legend(loc='upper left')
    # --- previous ending of visualize(), removed by this commit ---
    # Set chart title and labels
    ax.set_title(f'{model_name} {dataset_type} Context '
                 'Performance\nFact Retrieval Across '
                 'Context Lengths ("Needle In A Haystack")')
    ax.set_xlabel('Token Limit')
    ax.set_ylabel('Depth Percent')
    ax.set_xticklabels(pivot_table.columns.values, rotation=45)
    ax.set_yticklabels(pivot_table.index.values, rotation=0)
    # Add overall score as a subtitle
    plt.text(0.5, -0.13,
             f'Overall Score for {model_name}: '
             f'{overall_score:.2f}',
             ha='center', va='center',
             transform=ax.transAxes, fontsize=13)
    plt.tight_layout()
    plt.subplots_adjust(right=1)
    plt.draw()
    plt.savefig(save_path)
    print(f'Saved : {save_path}')
    plt.close()  # Close figure to prevent memory leaks
    return overall_score
    # --- new ending of visualize(), added by this commit ---
    if model_name in model_name_mapping:
        title_name = model_name_mapping[model_name]
    else:
        title_name = model_name
    ax.set_title(title_name, fontsize=12, fontweight='bold', pad=15)

    if dataset_type in dataset_mapping_dict:
        dataset_name = dataset_mapping_dict[dataset_type]
    else:
        dataset_name = dataset_type
    ax.text(0.5, 1.005,
            f'{dataset_name}: {overall_score:.2f}',
            transform=ax.transAxes,
            ha='center', fontsize=12, fontweight='normal')

    ax.set_xlabel('Token Length', fontsize=13, fontweight='normal',
                  labelpad=1)
    ax.set_ylabel('Depth Percent(%)', fontsize=13, fontweight='normal',
                  labelpad=1)

    converted_labels = [
        convert_to_k(value) for value in pivot_table.columns.values
    ]
    ax.tick_params(axis='both', which='major', length=1, pad=1)
    ax.tick_params(axis='both', which='minor', length=1, pad=1)
    ax.set_xticklabels(converted_labels, rotation=45)

    index_length = len(pivot_table.index)
    selected_indices = pivot_table.index.values[::2]
    labels = [str(int(index)) for index in selected_indices]
    ax.set_yticks(np.arange(0, len(pivot_table.index), 2))
    ax.set_yticklabels(labels, rotation=0)

    for spine in ax.spines.values():
        spine.set_visible(False)
    for spine in ax2.spines.values():
        spine.set_visible(False)

    plt.tight_layout()
    plt.draw()

    directory_path, original_filename = os.path.split(save_path)
    filename_suffix = (title_name + '_' + dataset_name).replace(' ', '_')
    new_filename = f'{filename_suffix}.png'
    new_save_path = os.path.join(directory_path, new_filename)
    plt.savefig(new_save_path, format='png', bbox_inches='tight',
                pad_inches=0)
    print(f'Saved : {new_save_path}')
    plt.close()
    return overall_score


# --- previous save_results_to_plots(), replaced by the simplified version
# shown earlier in this diff ---
def save_results_to_plots(txt_results_save_path):
    content = read_after_specific_line_except_last(
        txt_results_save_path, 'raw format', 2)
    parsed_data = parse_model_scores(content)
    model_names = get_dict_model_names(parsed_data)
    numbers = [2, 3, 4, 5]
    languages = ['en', 'zh']
    size_exists = []
    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k']
    for size in sizes_origin:
        if size in content:
            size_exists.append(size)
    multi_dataset_abbrs = [
        f'{num}needle_{lang}{size}' for num in numbers
        for lang in languages for size in size_exists
    ]
    origin_dataset_abbrs = [
        f'origin_{lang}{size}' for lang in languages for size in size_exists
    ]
    parallel_dataset_abbrs = [
        f'parallel_{lang}{size}' for lang in languages
        for size in size_exists
    ]
    dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + \
        parallel_dataset_abbrs
    base_path = os.path.dirname(txt_results_save_path)
    plot_path = os.path.join(base_path, 'plots')
    model_scores = {}
    for model_name in tqdm(model_names):
        model_datasets_scores = {}  # Dictionary to store scores for each dataset for the current model
        for dataset_abbr in dataset_abbrs:
            parallel_flag = 'parallel' in dataset_abbr
            # Create a directory for each dataset_abbr
            folder_path = os.path.join(plot_path, dataset_abbr)
            ensure_directory(folder_path)
            # Construct the full path to save the image
            save_path = os.path.join(folder_path, f'{model_name}.png')
            # Create DataFrame for the model and dataset
            df = create_model_dataframe(parsed_data, model_name,
                                        dataset_abbr, parallel=parallel_flag)
            # Generate visualization and get the score
            score = visualize(df, save_path, model_name, dataset_abbr)
            # Store the score in the dictionary
            model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score)
        # Process and visualize the overall score
        overall_score_pic_path = os.path.join(plot_path,
                                              f'{model_name}_overall.png')
        merged_df = merge_dataframes(model_name, dataset_abbrs, parsed_data)
        print(merge_dataframes)  # stray debug print, dropped by this commit
        averaged_df = calculate_elementwise_average(merged_df)
        # Assume visualize returns the average score for the overall visualization
        overall_score = visualize(averaged_df, overall_score_pic_path,
                                  'weighted_average_score', 'Overall Score')
        # Add the overall score to the dictionary
        model_datasets_scores['Overall'] = '{:.02f}'.format(overall_score)
        # Add the model's scores to the main dictionary
        model_scores[model_name] = model_datasets_scores
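The convert_to_k helper called by the new axis-label code is not shown in this excerpt; a minimal sketch consistent with how it is used (pivot-table column values in, 'k'-style tick labels out) might be:

# Sketch only; the real helper lives elsewhere in this file and may differ.
def convert_to_k(value):
    value = int(value)
    if value >= 1000:
        return f'{value // 1000}k'  # e.g. 8000 -> '8k', 128000 -> '128k'
    return str(value)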
def ensure_directory(path):
    if not os.path.exists(path):
        ...

@@ -263,29 +386,11 @@ def merge_dataframes(model_name, dataset_abbrs, parsed_data):
    merged_df = reduce(lambda left, right: pd.merge(
        left, right, on='dataset', how='outer'), dfs)
    if merged_df.isnull().any().any():
        print('Warning: Some rows were filtered out due to NaN values. '
              'This is often due to mismatched row counts among DataFrames.')
        merged_df = merged_df.dropna()
    return merged_df
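For intuition, a minimal sketch of the reduce-and-merge pattern used above; the column names and scores are illustrative:

from functools import reduce
import pandas as pd

dfs = [
    pd.DataFrame({'dataset': ['a', 'b'], 'origin_en_4k': [90, 80]}),
    pd.DataFrame({'dataset': ['a', 'b'], 'parallel_en_4k': [70, 60]}),
    pd.DataFrame({'dataset': ['a', 'b'], '2needle_en_4k': [50, 40]}),
]
merged = reduce(lambda left, right: pd.merge(left, right,
                                             on='dataset', how='outer'), dfs)
print(merged)
# The outer join keeps every 'dataset' row; missing cells become NaN,
# which the code above then warns about and drops.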
def calculate_elementwise_average(merged_df):
    score_columns = [col for col in merged_df.columns if col != 'dataset']

    origin_columns = [col for col in score_columns if 'origin' in col]
    parallel_columns = [col for col in score_columns if 'parallel' in col]
    multi_columns = [col for col in score_columns if 'needle' in col]

    if origin_columns and parallel_columns and multi_columns:
        origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4
        parallel_avg = merged_df[parallel_columns].mean(axis=1) * 0.3
        multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3
        merged_df['weighted_average_score'] = origin_avg + parallel_avg + \
            multi_avg
    else:
        merged_df['weighted_average_score'] = pd.Series([0] * len(merged_df))

    return merged_df.iloc[:, [0, -1]]
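The branch above weights the three NeedleBench tracks: single-needle (origin) retrieval at 0.4, parallel retrieval at 0.3, and multi-needle reasoning at 0.3. A quick worked check with made-up row means:

# Made-up per-row means: origin 90, parallel 70, multi-needle 50.
weighted = 90 * 0.4 + 70 * 0.3 + 50 * 0.3
print(round(weighted, 2))  # 72.0 (= 36 + 21 + 15)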
class NeedleBenchSummarizer(DefaultSummarizer):
    """NeedleBench summarizer in OpenCompass."""
...
@@ -303,20 +408,17 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        summarizer_dataset_abbrs = []
        if self.dataset_abbrs is None:
            # display all dataset metrics included in the config
            for dataset_abbr in dataset_abbrs:
                if dataset_abbr in dataset_metrics:
                    for metric in dataset_metrics[dataset_abbr]:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
                else:
                    summarizer_dataset_abbrs.append((dataset_abbr, None))
            # along with all possible group metrics
            for dataset_abbr in dataset_metrics:
                for metric in dataset_metrics[dataset_abbr]:
                    if (dataset_abbr, metric) not in summarizer_dataset_abbrs:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
        else:
            # follow the required order
            for item in self.dataset_abbrs:
                if isinstance(item, str):
                    summarizer_dataset_abbrs.append((item, None))
...
@@ -332,6 +434,7 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        for dataset_abbr, metric in summarizer_dataset_abbrs:
            if dataset_abbr not in dataset_metrics:
                table.append([dataset_abbr, '-', '-', '-'] +
                             ['-'] * len(self.model_abbrs))
                table.append(header)
                continue
...
@@ -378,33 +481,7 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        raw_txts = '\n'.join(raw_txts)
        return raw_txts
    def _read_and_sort_dataframe(self, file_path):
        # Read the file without treating the first row as a header
        df = pd.read_csv(file_path, header=None)

        # Sort columns based on the values in one anchor row,
        # keeping the first column (the row labels) in place
        def sort_columns_based_on_row_corrected(df, base_row_idx,
                                                start_row_idx, end_row_idx):
            # Extract the anchor row used for sorting
            sort_values_row = df.iloc[base_row_idx, 1:].replace(
                '-', np.nan).apply(pd.to_numeric, errors='coerce')
            # Push NaNs below the row minimum so they sort to the end
            min_possible_value = sort_values_row.min(skipna=True) - 1
            sort_values_row_filled = sort_values_row.fillna(
                min_possible_value)
            # Get the sorted column order, excluding the first column
            sorted_col_indices = sort_values_row_filled.sort_values(
                ascending=False).index
            # Apply the sorted column order to the whole block of rows
            df.iloc[start_row_idx:end_row_idx + 1] = df.iloc[
                start_row_idx:end_row_idx + 1,
                [0] + sorted_col_indices.tolist()]

        # Sort each block of rows by its own anchor row (0-based indices)
        sort_columns_based_on_row_corrected(df, 1, 0, 2)  # rows 0-2 by row 1
        sort_columns_based_on_row_corrected(df, 4, 3, 7)  # rows 3-7 by row 4
        sort_columns_based_on_row_corrected(df, 9, 8, 12)  # rows 8-12 by row 9
        sort_columns_based_on_row_corrected(df, 14, 13, 25)  # rows 13-25 by row 14

        # Return the sorted DataFrame
        return df
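    # For intuition (illustration only), the column reorder behaves like
    # this toy trace over an anchor row whose labels start at column 1:
    #   row = pd.Series({1: '-', 2: '30', 3: '20'})
    #   row = pd.to_numeric(row.replace('-', np.nan), errors='coerce')
    #   row = row.fillna(row.min(skipna=True) - 1)   # {1: 19.0, 2: 30.0, 3: 20.0}
    #   order = row.sort_values(ascending=False).index.tolist()   # [2, 3, 1]
    #   df.iloc[start:end + 1] = df.iloc[start:end + 1, [0] + order]
    # Column 0 stays pinned in front, and '-'/NaN columns sink to the end.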
    def _output_to_file(self, output_path, time_str, table, raw_txts):
        # output to file
        if output_path is None:
            output_path = osp.join(self.work_dir, 'summary',
                                   f'summary_{time_str}.txt')
            output_csv_path = osp.join(self.work_dir, 'summary',
                                       f'summary_{time_str}.csv')
...
@@ -436,38 +513,19 @@ class NeedleBenchSummarizer(DefaultSummarizer):
            f.write('\n'.join([','.join(row) for row in table]) + '\n')
        self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')

        df_sorted = self._read_and_sort_dataframe(output_csv_path)
        # note: splitting on the first '.' assumes the path has no other dots
        sorted_file_path = osp.abspath(output_csv_path).split(
            '.')[0] + '_sorted.csv'
        df_sorted.to_csv(sorted_file_path, index=False, header=False)
        self.logger.info(f'write sorted csv to {sorted_file_path}')
    def summarize(
            self,
            output_path: str = None,
            time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):  # noqa

        # pick up results
        raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \
            self._pick_up_results()

        # calculate group metrics
        raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \
            self._calculate_group_metrics(raw_results, parsed_results,
                                          dataset_metrics, dataset_eval_mode)
        # format table
        table = self._format_table(parsed_results, dataset_metrics,
                                   dataset_eval_mode)
        # format raw txt
        raw_txts = self._format_raw_txt(raw_results)

        # output to screen
        print(tabulate.tabulate(table, headers='firstrow'))

        # output to .txt / .csv files
        self._output_to_file(output_path, time_str, table, raw_txts)

        if self.lark_reporter:
            # message: "<user>'s detailed evaluation summary has been
            # written to <path>"
            content = f'{getpass.getuser()} 的'
            content += f'详细评测汇总已输出至 {osp.abspath(output_path)}'
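As a usage sketch, the summarizer is wired into an evaluation config before a run; the dataset_abbrs below are illustrative, not taken from this diff:

# Hypothetical config snippet; field names follow the usual OpenCompass
# summarizer convention.
summarizer = dict(
    type='NeedleBenchSummarizer',
    dataset_abbrs=['origin_en_4k', 'parallel_en_4k', '2needle_en_4k'],
)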
...