Commit 16f29b25 (unverified) in OpenDAS/opencompass
Authored Apr 07, 2024 by Mo Li; committed by GitHub on Apr 07, 2024
Parent: f2af4933

[Fix] Simplify needlebench summarizer (#1024)

* Conflicts: configs/summarizers/needlebench.py
* fix lint problems
Showing 4 changed files with 505 additions and 865 deletions (+505, -865).
configs/datasets/needlebench/atc/atc_choice_50.py  (+43, -0)
configs/datasets/needlebench/atc/atc_choice_80.py  (+43, -0)
configs/summarizers/needlebench.py                 (+188, -692)
opencompass/summarizers/needlebench.py             (+231, -173)
configs/datasets/needlebench/atc/atc_choice_50.py (new file, mode 0 → 100644, view @ 16f29b25)
from mmengine.config import read_base

with read_base():
    from .atc_choice_20 import *

needle_num_list = list(range(2, 50, 1))
needlebench_datasets = []

for _name in list(single_choice_prompts.keys()):
    needlebench_atc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=(single_choice_prompts[_name])),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, ),
    )

    needlebench_atc_eval_cfg = dict(
        evaluator=dict(type=CircularEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    for num_needles in needle_num_list:
        abbr = (f'NeedleBenchATCDataset-'
                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
        language = "English" if "en" in _name else "Chinese"
        if 'reasoning' in _name:
            abbr += '-Reasoning'

        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
            'path': names_path,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
            'eval_cfg': needlebench_atc_eval_cfg
        }
        needlebench_datasets.append(dataset_dict)
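For reference, a minimal standalone sketch of the abbreviation rule used above. The prompt-key names here are hypothetical stand-ins; the real keys come from `single_choice_prompts` in `atc_choice_20.py`.

# Sketch of the abbr naming rule; the two key names below are invented
# for illustration, not taken from atc_choice_20.py.
for _name in ['single_choice_zh', 'single_choice_en_reasoning']:
    for num_needles in (2, 49):
        abbr = (f'NeedleBenchATCDataset-'
                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
        if 'reasoning' in _name:
            abbr += '-Reasoning'
        print(abbr)
# NeedleBenchATCDataset-2Needle-ZH
# NeedleBenchATCDataset-49Needle-ZH
# NeedleBenchATCDataset-2Needle-EN-Reasoning
# NeedleBenchATCDataset-49Needle-EN-Reasoning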
configs/datasets/needlebench/atc/atc_choice_80.py (new file, mode 0 → 100644, view @ 16f29b25)
from mmengine.config import read_base

with read_base():
    from .atc_choice_20 import *

needle_num_list = list(range(2, 80, 1))
needlebench_datasets = []

for _name in list(single_choice_prompts.keys()):
    needlebench_atc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=(single_choice_prompts[_name])),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, ),
    )

    needlebench_atc_eval_cfg = dict(
        evaluator=dict(type=CircularEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    for num_needles in needle_num_list:
        abbr = (f'NeedleBenchATCDataset-'
                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
        language = "English" if "en" in _name else "Chinese"
        if 'reasoning' in _name:
            abbr += '-Reasoning'

        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
            'path': names_path,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
            'eval_cfg': needlebench_atc_eval_cfg
        }
        needlebench_datasets.append(dataset_dict)
configs/summarizers/needlebench.py (view @ 16f29b25)
from opencompass.summarizers.needlebench import NeedleBenchSummarizer
from opencompass.summarizers.needlebench import NeedleBenchATCSummarizer
# ----------NeedleBench-4k-summarizer----------
context_lengths_4k = list(range(1000, 5000, 1000))
depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57, 63, 68, 73, 78, 84, 89, 94, 100]
depths_list_sparse = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

# Initialize the lists
_needlebench_4k_2needle_en = []
_needlebench_4k_3needle_en = []
_needlebench_4k_4needle_en = []
_needlebench_4k_5needle_en = []
_needlebench_4k_2needle_zh = []
_needlebench_4k_3needle_zh = []
_needlebench_4k_4needle_zh = []
_needlebench_4k_5needle_zh = []
_needlebench_4k_origin_en = []
_needlebench_4k_origin_zh = []

# Fill the lists using nested loops
for original_context_length in context_lengths_4k:
    for depth_percent in depths:
        _needlebench_4k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_4k')
        _needlebench_4k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_4k')
        _needlebench_4k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_4k')
        _needlebench_4k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_4k')
        _needlebench_4k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_4k')
        _needlebench_4k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_4k')
        _needlebench_4k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_4k')
        _needlebench_4k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_4k')
        _needlebench_4k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_4k')
        _needlebench_4k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_4k')

# Concatenate the multi-needle and origin lists
_needlebench_4k_multi_needle_en = _needlebench_4k_2needle_en + _needlebench_4k_3needle_en + _needlebench_4k_4needle_en + _needlebench_4k_5needle_en
_needlebench_4k_multi_needle_zh = _needlebench_4k_2needle_zh + _needlebench_4k_3needle_zh + _needlebench_4k_4needle_zh + _needlebench_4k_5needle_zh
_needlebench_4k_origin = _needlebench_4k_origin_en + _needlebench_4k_origin_zh
_needlebench_4k_multi_needle = _needlebench_4k_multi_needle_en + _needlebench_4k_multi_needle_zh

# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_4k_parallel_en = []
_needlebench_4k_parallel_zh = []
for original_context_length in context_lengths_4k:
    _needlebench_4k_parallel_en.append(f'Length{original_context_length}_parallel_en_4k')
for original_context_length in context_lengths_4k:
    _needlebench_4k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_4k')
_needlebench_4k_parallel = _needlebench_4k_parallel_en + _needlebench_4k_parallel_zh

needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_4k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_4k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_4k_origin_en},
    {'name': 'multi_needle_en', 'subsets': _needlebench_4k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_4k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_4k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_4k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_4k_5needle_en},
    {'name': 'multi_needle_zh', 'subsets': _needlebench_4k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_4k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_4k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_4k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_4k_5needle_zh},
    {'name': 'multi_needle', 'subsets': _needlebench_4k_multi_needle},
    {'name': 'parallel_version', 'subsets': _needlebench_4k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_4k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_4k_parallel_en},
def create_m_rs_names_list(context_lengths, depths, needle_counts,
                           languages, dataset_size):
    names_dict = {}
    multi_needle_list = []
    multi_needle_en_list = []
    multi_needle_zh_list = []

    for needle_count in needle_counts:
        for language in languages:
            key = f"{needle_count}-Needle-{language.upper()}-{dataset_size.upper()}"
            names_list = [
                f"Length{length}Depth{int(depth)}_{needle_count}needle_{language}_{dataset_size}"
                for length in context_lengths
                for depth in depths
            ]
            names_dict[key] = names_list

            multi_needle_list.extend(names_list)
            if language == 'en':
                multi_needle_en_list.extend(names_list)
            elif language == 'zh':
                multi_needle_zh_list.extend(names_list)
    names_dict['Multi-Needle-Reasoning(M-RS)'] = multi_needle_list
    names_dict['Multi-Needle-Reasoning-EN'] = multi_needle_en_list
    names_dict['Multi-Needle-Reasoning-ZH'] = multi_needle_zh_list

    return names_dict
def create_summarizer(context_lengths, depths, dataset_size,
                      sparse_depths=None):
    needle_counts = ["2", "3", "4", "5"]
    languages = ["en", "zh"]
    if sparse_depths:
        depths = sparse_depths
    names_dict = {}
    multi_reasoning_names = create_m_rs_names_list(context_lengths, depths,
                                                   needle_counts, languages,
                                                   dataset_size)
    names_dict.update(multi_reasoning_names)

    single_needle_list = []
    single_needle_en_list = []
    single_needle_zh_list = []

    for language in languages:
        names_list = [
            f"Length{length}Depth{int(depth)}_origin_{language}_{dataset_size}"
            for length in context_lengths
            for depth in depths
        ]
        single_needle_list.extend(names_list)
        if language == 'en':
            single_needle_en_list.extend(names_list)
        elif language == 'zh':
            single_needle_zh_list.extend(names_list)
    names_dict['Single-Needle-Retrieval(S-RT)'] = single_needle_list
    names_dict['Single-Needle-Retrieval-EN'] = single_needle_en_list
    names_dict['Single-Needle-Retrieval-ZH'] = single_needle_zh_list

    parallel_list = []
    parallel_en_list = []
    parallel_zh_list = []

    for language in languages:
        names_list = [
            f"Length{length}_parallel_{language}_{dataset_size}"
            for length in context_lengths
        ]
        parallel_list.extend(names_list)
        if language == 'en':
            parallel_en_list.extend(names_list)
        elif language == 'zh':
            parallel_zh_list.extend(names_list)
    names_dict['Multi-Needle-Retrieval(M-RT)'] = parallel_list
    names_dict['Multi-Needle-Retrieval-EN'] = parallel_en_list
    names_dict['Multi-Needle-Retrieval-ZH'] = parallel_zh_list

    summary_groups = [{
        'name': key,
        'subsets': value
    } for key, value in names_dict.items()]

    summary_groups.append({
        'name': 'NeedleBench-Overall-Score',
        'subsets': [['Single-Needle-Retrieval(S-RT)', 'naive_average'],
                    ['Multi-Needle-Reasoning(M-RS)', 'naive_average'],
                    ['Multi-Needle-Retrieval(M-RT)', 'average_score']],
        'weights': {'Single-Needle-Retrieval(S-RT)': 0.4,
                    'Multi-Needle-Reasoning(M-RS)': 0.3,
                    'Multi-Needle-Retrieval(M-RT)': 0.3}})
    summarizer_config = {
        'type': NeedleBenchSummarizer,
        'summary_groups': summary_groups,
        'dataset_abbrs': [
            'NeedleBench-Overall-Score',
            f'--------- NeedleBench-{dataset_size.upper()}-Single-Needle-Retrieval ---------',
            'Single-Needle-Retrieval(S-RT)',
            'Single-Needle-Retrieval-EN',
            'Single-Needle-Retrieval-ZH',
            f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Retrieval ---------',
            'Multi-Needle-Retrieval(M-RT)',
            'Multi-Needle-Retrieval-EN',
            'Multi-Needle-Retrieval-ZH',
            f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Reasoning ---------',
            'Multi-Needle-Reasoning(M-RS)',
            'Multi-Needle-Reasoning-EN',
            'Multi-Needle-Reasoning-ZH',
            '2-Needle-EN-4K',
            '2-Needle-ZH-4K',
            '3-Needle-EN-4K',
            '3-Needle-ZH-4K',
            '4-Needle-EN-4K',
            '4-Needle-ZH-4K',
            '5-Needle-EN-4K',
            '5-Needle-ZH-4K',
        ]
    }
    return summarizer_config
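As a rough illustration of what create_summarizer yields, a sketch of one call and the shape of the returned config (shape only; the group names are derived from the function body above, and the sketch assumes depths is in scope as defined earlier):

# Sketch: one create_summarizer call and the config shape it returns.
cfg = create_summarizer(list(range(1000, 5000, 1000)), depths, '4k')
assert cfg['type'] is NeedleBenchSummarizer
# cfg['summary_groups'] holds one group per generated key, e.g.
# {'name': '2-Needle-EN-4K', 'subsets': ['Length1000Depth0_2needle_en_4k', ...]},
# plus a weighted 'NeedleBench-Overall-Score' group (0.4 / 0.3 / 0.3).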
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]
needlebench_4k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-4k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-4k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-4k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',
        # *_needlebench_4k_origin, *_needlebench_4k_multi_needle, *_needlebench_4k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-8k-summarizer----------
depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57, 63, 68, 73, 78, 84, 89, 94, 100]
depths_list_sparse = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
context_lengths_4k = list(range(1000, 5000, 1000))
needlebench_4k_summarizer = create_summarizer(context_lengths_4k, depths, "4k")

context_lengths_8k = list(range(5000, 9000, 1000))
# Initialize the lists
_needlebench_8k_2needle_en = []
_needlebench_8k_3needle_en = []
_needlebench_8k_4needle_en = []
_needlebench_8k_5needle_en = []
_needlebench_8k_2needle_zh = []
_needlebench_8k_3needle_zh = []
_needlebench_8k_4needle_zh = []
_needlebench_8k_5needle_zh = []
_needlebench_8k_origin_en = []
_needlebench_8k_origin_zh = []

# Fill the lists using nested loops
for original_context_length in context_lengths_8k:
    for depth_percent in depths:
        _needlebench_8k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_8k')
        _needlebench_8k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_8k')
        _needlebench_8k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_8k')
        _needlebench_8k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_8k')
        _needlebench_8k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_8k')
        _needlebench_8k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_8k')
        _needlebench_8k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_8k')
        _needlebench_8k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_8k')
        _needlebench_8k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_8k')
        _needlebench_8k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_8k')

# Concatenate the multi-needle and origin lists
_needlebench_8k_multi_needle_en = _needlebench_8k_2needle_en + _needlebench_8k_3needle_en + _needlebench_8k_4needle_en + _needlebench_8k_5needle_en
_needlebench_8k_multi_needle_zh = _needlebench_8k_2needle_zh + _needlebench_8k_3needle_zh + _needlebench_8k_4needle_zh + _needlebench_8k_5needle_zh
_needlebench_8k_origin = _needlebench_8k_origin_en + _needlebench_8k_origin_zh
_needlebench_8k_multi_needle = _needlebench_8k_multi_needle_en + _needlebench_8k_multi_needle_zh

# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_8k_parallel_en = []
_needlebench_8k_parallel_zh = []
for original_context_length in context_lengths_8k:
    _needlebench_8k_parallel_en.append(f'Length{original_context_length}_parallel_en_8k')
for original_context_length in context_lengths_8k:
    _needlebench_8k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_8k')
_needlebench_8k_parallel = _needlebench_8k_parallel_en + _needlebench_8k_parallel_zh

needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_8k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_8k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_8k_origin_en},
    {'name': 'multi_needle_en', 'subsets': _needlebench_8k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_8k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_8k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_8k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_8k_5needle_en},
    {'name': 'multi_needle_zh', 'subsets': _needlebench_8k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_8k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_8k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_8k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_8k_5needle_zh},
    {'name': 'multi_needle', 'subsets': _needlebench_8k_multi_needle},
    {'name': 'parallel_version', 'subsets': _needlebench_8k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_8k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_8k_parallel_en},
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]

needlebench_8k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-8k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-8k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-8k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',
        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-32k-summarizer----------
needlebench_8k_summarizer = create_summarizer(context_lengths_8k, depths, "8k")

context_lengths_32k = [9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]
# Initialize the lists
_needlebench_32k_2needle_en = []
_needlebench_32k_3needle_en = []
_needlebench_32k_4needle_en = []
_needlebench_32k_5needle_en = []
_needlebench_32k_2needle_zh = []
_needlebench_32k_3needle_zh = []
_needlebench_32k_4needle_zh = []
_needlebench_32k_5needle_zh = []
_needlebench_32k_origin_en = []
_needlebench_32k_origin_zh = []

# Fill the lists using nested loops
for original_context_length in context_lengths_32k:
    for depth_percent in depths_list_sparse:
        _needlebench_32k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_32k')
        _needlebench_32k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_32k')
        _needlebench_32k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_32k')
        _needlebench_32k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_32k')
        _needlebench_32k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_32k')
        _needlebench_32k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_32k')
        _needlebench_32k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_32k')
        _needlebench_32k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_32k')
        _needlebench_32k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_32k')
        _needlebench_32k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_32k')

# Concatenate the multi-needle and origin lists
_needlebench_32k_multi_needle_en = _needlebench_32k_2needle_en + _needlebench_32k_3needle_en + _needlebench_32k_4needle_en + _needlebench_32k_5needle_en
_needlebench_32k_multi_needle_zh = _needlebench_32k_2needle_zh + _needlebench_32k_3needle_zh + _needlebench_32k_4needle_zh + _needlebench_32k_5needle_zh
_needlebench_32k_origin = _needlebench_32k_origin_en + _needlebench_32k_origin_zh
_needlebench_32k_multi_needle = _needlebench_32k_multi_needle_en + _needlebench_32k_multi_needle_zh

# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_32k_parallel_en = []
_needlebench_32k_parallel_zh = []
for original_context_length in context_lengths_32k:
    _needlebench_32k_parallel_en.append(f'Length{original_context_length}_parallel_en_32k')
for original_context_length in context_lengths_32k:
    _needlebench_32k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_32k')
_needlebench_32k_parallel = _needlebench_32k_parallel_en + _needlebench_32k_parallel_zh

needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_32k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_32k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_32k_origin_en},
    {'name': 'multi_needle_en', 'subsets': _needlebench_32k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_32k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_32k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_32k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_32k_5needle_en},
    {'name': 'multi_needle_zh', 'subsets': _needlebench_32k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_32k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_32k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_32k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_32k_5needle_zh},
    {'name': 'multi_needle', 'subsets': _needlebench_32k_multi_needle},
    {'name': 'parallel_version', 'subsets': _needlebench_32k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_32k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_32k_parallel_en},
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]

needlebench_32k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-32k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-32k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-32k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',
        # *_needlebench_32k_origin, *_needlebench_32k_multi_needle, *_needlebench_32k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-128k-summarizer----------
needlebench_32k_summarizer = create_summarizer(context_lengths_32k, depths_list_sparse, "32k")

context_lengths_128k = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
# Initialize the lists
_needlebench_128k_2needle_en = []
_needlebench_128k_3needle_en = []
_needlebench_128k_4needle_en = []
_needlebench_128k_5needle_en = []
_needlebench_128k_2needle_zh = []
_needlebench_128k_3needle_zh = []
_needlebench_128k_4needle_zh = []
_needlebench_128k_5needle_zh = []
_needlebench_128k_origin_en = []
_needlebench_128k_origin_zh = []

# Fill the lists using nested loops
for original_context_length in context_lengths_128k:
    for depth_percent in depths_list_sparse:
        _needlebench_128k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_128k')
        _needlebench_128k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_128k')
        _needlebench_128k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_128k')
        _needlebench_128k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_128k')
        _needlebench_128k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_128k')
        _needlebench_128k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_128k')
        _needlebench_128k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_128k')
        _needlebench_128k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_128k')
        _needlebench_128k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_128k')
        _needlebench_128k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_128k')

# Concatenate the multi-needle and origin lists
_needlebench_128k_multi_needle_en = _needlebench_128k_2needle_en + _needlebench_128k_3needle_en + _needlebench_128k_4needle_en + _needlebench_128k_5needle_en
_needlebench_128k_multi_needle_zh = _needlebench_128k_2needle_zh + _needlebench_128k_3needle_zh + _needlebench_128k_4needle_zh + _needlebench_128k_5needle_zh
_needlebench_128k_origin = _needlebench_128k_origin_en + _needlebench_128k_origin_zh
_needlebench_128k_multi_needle = _needlebench_128k_multi_needle_en + _needlebench_128k_multi_needle_zh

# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_128k_parallel_en = []
_needlebench_128k_parallel_zh = []
for original_context_length in context_lengths_128k:
    _needlebench_128k_parallel_en.append(f'Length{original_context_length}_parallel_en_128k')
for original_context_length in context_lengths_128k:
    _needlebench_128k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_128k')
_needlebench_128k_parallel = _needlebench_128k_parallel_en + _needlebench_128k_parallel_zh

needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_128k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_128k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_128k_origin_en},
    {'name': 'multi_needle_en', 'subsets': _needlebench_128k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_128k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_128k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_128k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_128k_5needle_en},
    {'name': 'multi_needle_zh', 'subsets': _needlebench_128k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_128k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_128k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_128k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_128k_5needle_zh},
    {'name': 'multi_needle', 'subsets': _needlebench_128k_multi_needle},
    {'name': 'parallel_version', 'subsets': _needlebench_128k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_128k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_128k_parallel_en},
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]

needlebench_128k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-128k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-128k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-128k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',
        # *_needlebench_128k_origin, *_needlebench_128k_multi_needle, *_needlebench_128k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-200k-summarizer----------
needlebench_128k_summarizer = create_summarizer(context_lengths_128k, depths_list_sparse, "128k")

context_lengths_200k = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000])
# Initialize the lists
_needlebench_200k_2needle_en = []
_needlebench_200k_3needle_en = []
_needlebench_200k_4needle_en = []
_needlebench_200k_5needle_en = []
_needlebench_200k_2needle_zh = []
_needlebench_200k_3needle_zh = []
_needlebench_200k_4needle_zh = []
_needlebench_200k_5needle_zh = []
_needlebench_200k_origin_en = []
_needlebench_200k_origin_zh = []

# Fill the lists using nested loops
for original_context_length in context_lengths_200k:
    for depth_percent in depths_list_sparse:
        _needlebench_200k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_200k')
        _needlebench_200k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_200k')
        _needlebench_200k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_200k')
        _needlebench_200k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_200k')
        _needlebench_200k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_200k')
        _needlebench_200k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_200k')
        _needlebench_200k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_200k')
        _needlebench_200k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_200k')
        _needlebench_200k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_200k')
        _needlebench_200k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_200k')

# Concatenate the multi-needle and origin lists
_needlebench_200k_multi_needle_en = _needlebench_200k_2needle_en + _needlebench_200k_3needle_en + _needlebench_200k_4needle_en + _needlebench_200k_5needle_en
_needlebench_200k_multi_needle_zh = _needlebench_200k_2needle_zh + _needlebench_200k_3needle_zh + _needlebench_200k_4needle_zh + _needlebench_200k_5needle_zh
_needlebench_200k_origin = _needlebench_200k_origin_en + _needlebench_200k_origin_zh
_needlebench_200k_multi_needle = _needlebench_200k_multi_needle_en + _needlebench_200k_multi_needle_zh

# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_200k_parallel_en = []
_needlebench_200k_parallel_zh = []
for original_context_length in context_lengths_200k:
    _needlebench_200k_parallel_en.append(f'Length{original_context_length}_parallel_en_200k')
for original_context_length in context_lengths_200k:
    _needlebench_200k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_200k')
_needlebench_200k_parallel = _needlebench_200k_parallel_en + _needlebench_200k_parallel_zh

needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_200k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_200k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_200k_origin_en},
    {'name': 'multi_needle_en', 'subsets': _needlebench_200k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_200k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_200k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_200k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_200k_5needle_en},
    {'name': 'multi_needle_zh', 'subsets': _needlebench_200k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_200k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_200k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_200k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_200k_5needle_zh},
    {'name': 'multi_needle', 'subsets': _needlebench_200k_multi_needle},
    {'name': 'parallel_version', 'subsets': _needlebench_200k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_200k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_200k_parallel_en},
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]

needlebench_200k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-200k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-200k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-200k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',
        # *_needlebench_200k_origin, *_needlebench_200k_multi_needle, *_needlebench_200k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-1000k-summarizer----------
needlebench_200k_summarizer = create_summarizer(context_lengths_200k, depths_list_sparse, "200k")

context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
# Initialize the lists
_needlebench_1000k_2needle_en = []
_needlebench_1000k_3needle_en = []
_needlebench_1000k_4needle_en = []
_needlebench_1000k_5needle_en = []
_needlebench_1000k_2needle_zh = []
_needlebench_1000k_3needle_zh = []
_needlebench_1000k_4needle_zh = []
_needlebench_1000k_5needle_zh = []
_needlebench_1000k_origin_en = []
_needlebench_1000k_origin_zh = []

# Fill the lists using nested loops
for original_context_length in context_lengths_1000k:
    for depth_percent in depths_list_sparse:
        _needlebench_1000k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_1000k')
        _needlebench_1000k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_1000k')
        _needlebench_1000k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_1000k')
        _needlebench_1000k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_1000k')
        _needlebench_1000k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_1000k')
        _needlebench_1000k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_1000k')
        _needlebench_1000k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_1000k')
        _needlebench_1000k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_1000k')
        _needlebench_1000k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_1000k')
        _needlebench_1000k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_1000k')

# Concatenate the multi-needle and origin lists
_needlebench_1000k_multi_needle_en = _needlebench_1000k_2needle_en + _needlebench_1000k_3needle_en + _needlebench_1000k_4needle_en + _needlebench_1000k_5needle_en
_needlebench_1000k_multi_needle_zh = _needlebench_1000k_2needle_zh + _needlebench_1000k_3needle_zh + _needlebench_1000k_4needle_zh + _needlebench_1000k_5needle_zh
_needlebench_1000k_origin = _needlebench_1000k_origin_en + _needlebench_1000k_origin_zh
_needlebench_1000k_multi_needle = _needlebench_1000k_multi_needle_en + _needlebench_1000k_multi_needle_zh

# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_1000k_parallel_en = []
_needlebench_1000k_parallel_zh = []
for original_context_length in context_lengths_1000k:
    _needlebench_1000k_parallel_en.append(f'Length{original_context_length}_parallel_en_1000k')
for original_context_length in context_lengths_1000k:
    _needlebench_1000k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_1000k')
_needlebench_1000k_parallel = _needlebench_1000k_parallel_en + _needlebench_1000k_parallel_zh

needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, "1000k")
needlebench_summary_groups = [
    {'name': 'original_version', 'subsets': _needlebench_1000k_origin},
    {'name': 'original_version_zh', 'subsets': _needlebench_1000k_origin_zh},
    {'name': 'original_version_en', 'subsets': _needlebench_1000k_origin_en},
    {'name': 'multi_needle_en', 'subsets': _needlebench_1000k_multi_needle_en},
    {'name': 'multi_needle2_en', 'subsets': _needlebench_1000k_2needle_en},
    {'name': 'multi_needle3_en', 'subsets': _needlebench_1000k_3needle_en},
    {'name': 'multi_needle4_en', 'subsets': _needlebench_1000k_4needle_en},
    {'name': 'multi_needle5_en', 'subsets': _needlebench_1000k_5needle_en},
    {'name': 'multi_needle_zh', 'subsets': _needlebench_1000k_multi_needle_zh},
    {'name': 'multi_needle2_zh', 'subsets': _needlebench_1000k_2needle_zh},
    {'name': 'multi_needle3_zh', 'subsets': _needlebench_1000k_3needle_zh},
    {'name': 'multi_needle4_zh', 'subsets': _needlebench_1000k_4needle_zh},
    {'name': 'multi_needle5_zh', 'subsets': _needlebench_1000k_5needle_zh},
    {'name': 'multi_needle', 'subsets': _needlebench_1000k_multi_needle},
    {'name': 'parallel_version', 'subsets': _needlebench_1000k_parallel},
    {'name': 'parallel_version_zh', 'subsets': _needlebench_1000k_parallel_zh},
    {'name': 'parallel_version_en', 'subsets': _needlebench_1000k_parallel_en},
    {'name': 'overall',
     'subsets': [['original_version', 'naive_average'],
                 ['multi_needle', 'naive_average'],
                 ['parallel_version', 'average_score']],
     'weights': {'original_version': 0.4,
                 'multi_needle': 0.3,
                 'parallel_version': 0.3}},
]

needlebench_1000k_summarizer = dict(
    type=NeedleBenchSummarizer,
    dataset_abbrs=[
        'overall',
        '--------- NeedleBench-1000k Single-Needle ---------',  # category
        'original_version',
        'original_version_zh',
        'original_version_en',
        '--------- NeedleBench-1000k Parallel-Needles ---------',  # category
        'parallel_version',
        'parallel_version_zh',
        'parallel_version_en',
        '--------- NeedleBench-1000k Multi-Needles ---------',  # category
        'multi_needle',
        'multi_needle_en',
        'multi_needle_zh',
        'multi_needle2_en',
        'multi_needle3_en',
        'multi_needle4_en',
        'multi_needle5_en',
        'multi_needle2_zh',
        'multi_needle3_zh',
        'multi_needle4_zh',
        'multi_needle5_zh',
        # *_needlebench_1000k_origin, *_needlebench_1000k_multi_needle, *_needlebench_1000k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
context_lengths_8k = list(range(5000, 9000, 1000))

# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_8k_parallel_en_batch1 = []
_needlebench_8k_parallel_en_batch5 = []
_needlebench_8k_parallel_en_batch10 = []
...
...
@@ -713,7 +202,6 @@ needlebench_8k_batch_overall_summarizer = dict(
        'parallel_version_en_batch15',
        'parallel_version_zh_batch20',
        'parallel_version_en_batch20',
        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
...
...
@@ -754,64 +242,72 @@ needlebench_8k_batch_depth0_summarizer = dict(
        'parallel_version_en_batch15',
        'parallel_version_zh_batch20',
        'parallel_version_en_batch20',
        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
needle_num_list = list(range(2, 20, 1))
categories = ['ZH', 'EN', 'ZH-Reasoning', 'EN-Reasoning',
              'ZH-CircularEval', 'EN-CircularEval',
              'ZH-Reasoning-Circular', 'EN-Reasoning-Circular']

needlebench_atc_summary_groups = []
for category in categories:
    metric = 'perf_4' if 'CircularEval' in category else 'acc_1'
    cleaned_category = category.replace('-CircularEval', '').replace('-Circular', '')
    subsets = [
        f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}'
        for num_needles in needle_num_list
    ]
def gen_atc_summarizer(needle_num_list):
    categories = ['ZH-Direct-CE', 'EN-Direct-CE',
                  'ZH-Reasoning-CE', 'EN-Reasoning-CE']
    needlebench_atc_summary_groups = []
    # Generate summary groups per category
    for category in categories:
        # Use the perf_4 metric for CircularEval-style scoring, acc_1 otherwise
        metric = 'perf_4' if 'CE' in category else 'acc_1'
        # The dataset names used in subsets do not carry the CircularEval tag
        cleaned_category = category.replace('-CE', '').replace('-Direct', '')
        needlebench_atc_summary_groups.append({
            'name': category,
            'subsets': [
                [f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}', metric]
                for num_needles in needle_num_list
            ],
            'weights': {
                f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}': num_needles
                for num_needles in needle_num_list
            },
        })
    needlebench_atc_summary_groups.append({
        'name': category,
        'name': 'ATC-CE-Overall',
        'subsets': [
            [f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}', metric]
            for num_needles in needle_num_list
        ]
    })
atc_dataset_abbrs = []
for category in categories:
    title = f'######## Needlebench-ATC-{category}-Score ########'
    atc_dataset_abbrs.append(title)
    weighted_average_score_entry = [f'{category}', 'weighted_average']
    atc_dataset_abbrs.append(weighted_average_score_entry)

if atc_dataset_abbrs[-1] == '------------------------------------------':
    atc_dataset_abbrs.pop()

needlebench_atc_summarizer = dict(
    dataset_abbrs=[
        *atc_dataset_abbrs,
        '######## Needlebench-ATC Accuracy ########',  # category
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'acc_1'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'acc_1'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'acc_1'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'acc_1'] for num_needles in needle_num_list],
        '------------------------------------------',
        '######## Needlebench-ATC CircularEval ########',  # category
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'perf_4'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'perf_4'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'perf_4'] for num_needles in needle_num_list],
        '------------------------------------------',
        *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'perf_4'] for num_needles in needle_num_list],
        '------------------------------------------',
    ],
    summary_groups=needlebench_atc_summary_groups)
            [f'{category}', 'weighted_average']
            for category in categories
        ],
    })

    atc_dataset_abbrs = []
    atc_dataset_abbrs.append(['ATC-CE-Overall', 'naive_average'])
    for category in categories:
        weighted_average_score_entry = [f'{category}', 'weighted_average']
        atc_dataset_abbrs.append(weighted_average_score_entry)
    needlebench_atc_summarizer = dict(
        dataset_abbrs=[
            *atc_dataset_abbrs,
            '######## Needlebench-ATC Accuracy ########',  # category
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            '######## Needlebench-ATC CircularEval ########',  # category
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
        ],
        summary_groups=needlebench_atc_summary_groups)
    return needlebench_atc_summarizer
atc_summarizer_20 = gen_atc_summarizer(list(range(2, 20, 1)))
atc_summarizer_50 = gen_atc_summarizer(list(range(2, 50, 1)))
atc_summarizer_80 = gen_atc_summarizer(list(range(2, 80, 1)))
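The 'weights' mapping in gen_atc_summarizer makes harder settings count more: each ATC dataset is weighted by its needle count. A minimal, self-contained sketch of that weighting, assuming the weighted_average aggregation is the usual weight-normalized mean (sum of weight times score over sum of weights):

# Hypothetical per-dataset scores, keyed by needle count.
scores = {2: 90.0, 3: 80.0, 4: 60.0}
weights = {n: n for n in scores}  # mirrors 'weights' above: weight == num_needles
weighted = sum(scores[n] * weights[n] for n in scores) / sum(weights.values())
print(round(weighted, 2))  # (180 + 240 + 240) / 9 = 73.33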
opencompass/summarizers/needlebench.py (view @ 16f29b25)
...
@@ -5,6 +5,7 @@ import getpass
import math
import os
import os.path as osp
import shutil
from datetime import datetime
from typing import Any, Dict, List, Optional
...
@@ -26,6 +27,92 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
                               model_abbr_from_cfg)
from opencompass.utils.prompt import get_prompt_hash
model_name_mapping = {
    'llama-2-7b-chat-hf': 'LLaMA-2-7B',
    'llama-2-13b-chat-hf': 'LLaMA-2-13B',
    'llama-2-70b-chat-hf': 'LLaMA-2-70B',
    'baichuan2-7b-chat-hf': 'Baichuan2-7B',
    'baichuan2-13b-chat-hf': 'Baichuan2-13B',
    'yi-6b-chat-hf': 'Yi-6B',
    'yi-34b-chat-hf': 'Yi-34B',
    'deepseek-67b-chat-hf': 'DeepSeek-67B',
    'wizardlm-70b-v1.0-vllm': 'WizardLM-70B',
    'qwen-14b-chat-hf': 'Qwen-14B',
    'qwen-72b-chat-hf': 'Qwen-72B',
    'qwen-72b-chat-vllm': 'Qwen-72B-vLLM',
    'internlm2-chat-7b-turbomind': 'InternLM2-7B-200K',
    'internlm2-chat-20b-turbomind': 'InternLM2-20B-200K',
    'internlm2-chat-7b-hf': 'InternLM2-7B',
    'internlm2-chat-20b-hf': 'InternLM2-20B',
    'qwen-7b-chat-hf': 'Qwen-7B',
    'chatglm3-6b-hf': 'ChatGLM3-6B',
    'chatglm3-6b-32k-hf': 'ChatGLM3-6B-32K',
    'zephyr-7b-beta-vllm': 'Zephyr-7B Beta',
    'mistral-7b-instruct-v0.2-vllm': 'Mistral-7B Inst. v0.2',
    'mistral-7b-instruct-v0.1-vllm': 'Mistral-7B Inst. v0.1',
    'mixtral-8x7b-instruct-v0.1-vllm': 'Mixtral-8x7B Inst. v0.1',
    'orionstar-yi-34b-chat-hf': 'OrionStar-Yi-34B',
    'orionstar-14b-long-chat-vllm': 'Orion-14B-LongChat',
    'internlm-chat-7b-hf': 'InternLM-7B',
    'gemma-2b-it-hf': 'Gemma-2B',
    'gemma-7b-it-hf': 'Gemma-7B',
    'qwen1.5-0.5b-chat-hf': 'Qwen-1.5-0.5B',
    'qwen1.5-1.8b-chat-hf': 'Qwen-1.5-1.8B',
    'qwen1.5-4b-chat-hf': 'Qwen-1.5-4B',
    'qwen1.5-14b-chat-hf': 'Qwen-1.5-14B',
    'qwen1.5-72b-chat-hf': 'Qwen-1.5-72B',
    'qwen1.5-14b-chat-vllm': 'Qwen-1.5-14B-vLLM',
    'qwen1.5-72b-chat-vllm': 'Qwen-1.5-72B-vLLM',
    'glm4_notools': 'GLM-4',
    'claude-3-opus': 'Claude-3-Opus',
    # Add more mappings as necessary
}
dataset_mapping_dict = {}

needle_counts = ['2', '3', '4', '5']
languages = ['en', 'zh']
sizes = ['4k', '8k', '32k', '200k', '1000k']
types = ['origin', 'parallel']

for needle_count in needle_counts:
    for language in languages:
        for size in sizes:
            key = f'{needle_count}needle_{language}_{size}'
            value = f'{needle_count}-Needle-Reasoning-{language.upper()}-{size.upper()}'
            dataset_mapping_dict[key] = value

for t in types:
    for language in languages:
        for size in sizes:
            if t == 'origin':
                key = f'{t}_{language}_{size}'
                value = f'Single-Needle-Retrieval-{language.upper()}-{size.upper()}'
            elif t == 'parallel':
                key = f'{t}_{language}_{size}'
                value = f'Multi-Needle-Retrieval-{language.upper()}-{size.upper()}'
            dataset_mapping_dict[key] = value
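The mapping the loops above build is fully deterministic; a few entries, worked out by hand from the code:

# Example entries produced by the loops above (derived from the code, not the page):
assert dataset_mapping_dict['2needle_en_4k'] == '2-Needle-Reasoning-EN-4K'
assert dataset_mapping_dict['origin_zh_8k'] == 'Single-Needle-Retrieval-ZH-8K'
assert dataset_mapping_dict['parallel_en_200k'] == 'Multi-Needle-Retrieval-EN-200K'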
def calculate_elementwise_average(model_name, merged_df):
    score_columns = [col for col in merged_df.columns if col != 'dataset']

    origin_columns = [col for col in score_columns if 'origin' in col]
    parallel_columns = [col for col in score_columns if 'parallel' in col]
    multi_columns = [col for col in score_columns if 'needle' in col]

    if origin_columns and parallel_columns and multi_columns:
        origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4
        parallel_avg = merged_df[parallel_columns].mean(axis=1) * 0.3
        multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3
        merged_df[model_name] = origin_avg + parallel_avg + multi_avg
    else:
        relevant_columns = origin_columns or parallel_columns or multi_columns
        if relevant_columns:
            merged_df[model_name] = merged_df[relevant_columns].mean(axis=1)
        else:
            merged_df[model_name] = pd.Series([0] * len(merged_df))

    return merged_df.iloc[:, [0, -1]]
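A minimal, self-contained sketch of what this weighting does on toy data; the column names are invented to match the 'origin'/'parallel'/'needle' substring checks above:

import pandas as pd

toy_df = pd.DataFrame({
    'dataset': ['Length1000Depth0'],
    'origin_en_4k': [80.0],    # single-needle retrieval, weight 0.4
    'parallel_en_4k': [60.0],  # multi-needle retrieval, weight 0.3
    '2needle_en_4k': [50.0],   # multi-needle reasoning, weight 0.3
})
print(calculate_elementwise_average('demo-model', toy_df))
# demo-model score = 80*0.4 + 60*0.3 + 50*0.3 = 65.0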
def read_after_specific_line_except_last(file_name, keyword, offset):
    with open(file_name, 'r', encoding='utf-8') as file:
...
...
@@ -65,6 +152,12 @@ def create_model_dataframe(nested_dict, model_name, dataset_abbr, parallel=False
    df = pd.DataFrame(data, columns=['dataset', model_name])
    return df
def convert_to_k(value):
    try:
        return f'{int(value) // 1000}k'
    except ValueError:
        return value
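For instance, straight from the logic above:

print(convert_to_k(128000))  # -> '128k'
print(convert_to_k('N/A'))   # non-numeric inputs pass through unchanged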
def parse_model_scores(text):
    lines = text.split('\n')
...
...
@@ -82,8 +175,86 @@ def parse_model_scores(text):
    return result_dict
def remove_empty_subfolders(plot_path):
    for folder_name in tqdm(os.listdir(plot_path),
                            desc='Deleting Empty folders'):
        folder_path = os.path.join(plot_path, folder_name)
        if os.path.isdir(folder_path):
            if not os.listdir(folder_path):
                shutil.rmtree(folder_path)
def save_results_to_plots(txt_results_save_path):
    content = read_after_specific_line_except_last(txt_results_save_path,
                                                   'raw format', 2)
    parsed_data = parse_model_scores(content)
    model_names = get_dict_model_names(parsed_data)
    numbers = [2, 3, 4, 5]
    languages = ['en', 'zh']
    size_exists = []
    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k']

    for size in sizes_origin:
        if size in content:
            size_exists.append(size)

    multi_dataset_abbrs = [
        f'{num}needle_{lang}{size}' for num in numbers
        for lang in languages for size in size_exists
    ]
    origin_dataset_abbrs = [
        f'origin_{lang}{size}' for lang in languages for size in size_exists
    ]
    parallel_dataset_abbrs = [
        f'parallel_{lang}{size}' for lang in languages for size in size_exists
    ]

    dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + \
        parallel_dataset_abbrs
    base_path = os.path.dirname(txt_results_save_path)
    plot_path = os.path.join(base_path, 'plots')
    model_scores = {}
    for model_name in tqdm(model_names):
        model_datasets_scores = {}  # Dictionary to store scores for each dataset for the current model
        for dataset_abbr in dataset_abbrs:
            parallel_flag = 'parallel' in dataset_abbr
            folder_path = os.path.join(plot_path,
                                       dataset_mapping_dict[dataset_abbr])
            ensure_directory(folder_path)
            save_path = os.path.join(folder_path, f'{model_name}.png')
            df = create_model_dataframe(parsed_data, model_name, dataset_abbr,
                                        parallel=parallel_flag)
            score = visualize(df, save_path, model_name, dataset_abbr)
            model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score)

        overall_dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + parallel_dataset_abbrs
        overall_score_pic_path = os.path.join(plot_path,
                                              f'{model_name}_overall.png')
        merged_df = merge_dataframes(model_name, overall_dataset_abbrs,
                                     parsed_data)
        averaged_df = calculate_elementwise_average(model_name, merged_df)
        overall_score = visualize(averaged_df, overall_score_pic_path,
                                  model_name, 'Overall Score')

        # Single-Retrieval
        single_retrieval_score_pic_path = os.path.join(
            plot_path, f'{model_name}_single_retrieval_overall.png')
        single_retrieval_merged_df = merge_dataframes(model_name,
                                                      origin_dataset_abbrs,
                                                      parsed_data)
        single_retrieval_averaged_df = calculate_elementwise_average(
            model_name, single_retrieval_merged_df)
        single_retrieval_overall_score = visualize(
            single_retrieval_averaged_df, single_retrieval_score_pic_path,
            model_name, 'Single-Retrieval Overall Score')

        # Multi-Retrieval
        multi_retrieval_score_pic_path = os.path.join(
            plot_path, f'{model_name}_multi_retrieval_overall.png')
        multi_retrieval_merged_df = merge_dataframes(model_name,
                                                     parallel_dataset_abbrs,
                                                     parsed_data)
        multi_retrieval_averaged_df = calculate_elementwise_average(
            model_name, multi_retrieval_merged_df)
        multi_retrieval_overall_score = visualize(
            multi_retrieval_averaged_df, multi_retrieval_score_pic_path,
            model_name, 'Multi-Retrieval Overall Score')

        # Multi-Reasoning
        multi_reasoning_score_pic_path = os.path.join(
            plot_path, f'{model_name}_multi_reasoning_overall.png')
        multi_reasoning_merged_df = merge_dataframes(model_name,
                                                     multi_dataset_abbrs,
                                                     parsed_data)
        multi_reasoning_averaged_df = calculate_elementwise_average(
            model_name, multi_reasoning_merged_df)
        multi_reasoning_overall_score = visualize(
            multi_reasoning_averaged_df, multi_reasoning_score_pic_path,
            model_name, 'Multi-Reasoning Overall Score')

        model_scores[model_name] = averaged_df
    remove_empty_subfolders(plot_path)
    return model_scores
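A sketch of how this entry point would be invoked after a NeedleBench run; the results path here is hypothetical, standing in for wherever OpenCompass writes the text summary. Plots land in a 'plots/' directory next to that file, and the return value maps each model name to its averaged-score DataFrame.

# Hypothetical invocation; 'outputs/needlebench/summary/summary.txt' is a
# placeholder path, not one taken from this commit.
scores = save_results_to_plots('outputs/needlebench/summary/summary.txt')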
def visualize(df_raw, save_path: str, model_name: str, dataset_type: str):
    df = df_raw.copy()
    if df.empty:
        return -1
    df['Context Length'] = df['dataset'].apply(
        lambda x: int(x.split('Length')[1].split('Depth')[0]))
    df['Document Depth'] = df['dataset'].apply(
...
...
@@ -98,144 +269,96 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
    model_df = df[['Document Depth', 'Context Length', model_name]].copy()
    model_df.rename(columns={model_name: 'Score'}, inplace=True)

    # Create pivot table
    pivot_table = pd.pivot_table(model_df,
                                 values='Score',
                                 index=['Document Depth'],
                                 columns=['Context Length'],
                                 aggfunc='mean')

    # Calculate mean scores
    mean_scores = pivot_table.mean().values

    # Calculate overall score
    overall_score = mean_scores.mean()

    # Create heatmap and line plot
-   plt.figure(figsize=(15.5, 8))
    plt.figure(figsize=(10, 6))
    ax = plt.gca()
    cmap = LinearSegmentedColormap.from_list(
        'custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])

    # Draw heatmap
    sns.heatmap(pivot_table,
                cmap=cmap,
                ax=ax,
                cbar_kws={'label': 'Score'},
                vmin=0,
                vmax=100)
    cbar = ax.collections[0].colorbar

    # Set line plot data
    x_data = [i + 0.5 for i in range(len(mean_scores))]
    y_data = mean_scores

    # Create twin axis for line plot
    ax2 = ax.twinx()
    # Draw line plot
    ax2.plot(x_data,
             y_data,
             color='white',
             marker='o',
             linestyle='-',
             linewidth=2,
             markersize=8,
             label='Average Depth Score')
    ax2.set_ylim(0, 100)

    # Hide original y-axis ticks and labels
    ax2.set_yticklabels([])
    ax2.set_yticks([])

    # Add legend
-   ax2.legend(loc='upper left')
-   # Set chart title and labels
-   ax.set_title(f'{model_name} {dataset_type} Context '
-                'Performance\nFact Retrieval Across '
-                'Context Lengths ("Needle In A Haystack")')
-   ax.set_xlabel('Token Limit')
-   ax.set_ylabel('Depth Percent')
-   ax.set_xticklabels(pivot_table.columns.values, rotation=45)
-   ax.set_yticklabels(pivot_table.index.values, rotation=0)
-   # Add overall score as a subtitle
-   plt.text(0.5, -0.13,
-            f'Overall Score for {model_name}: '
-            f'{overall_score:.2f}',
-            ha='center', va='center',
-            transform=ax.transAxes, fontsize=13)
-   plt.tight_layout()
-   plt.subplots_adjust(right=1)
-   plt.draw()
-   plt.savefig(save_path)
-   print(f'Saved : {save_path}')
-   plt.close()  # Close figure to prevent memory leaks
-   return overall_score
    ax2.legend(loc='lower left')

    if model_name in model_name_mapping:
        title_name = model_name_mapping[model_name]
    else:
        title_name = model_name

    ax.set_title(title_name, fontsize=12, fontweight='bold', pad=15)

    if dataset_type in dataset_mapping_dict:
        dataset_name = dataset_mapping_dict[dataset_type]
    else:
        dataset_name = dataset_type

    ax.text(0.5,
            1.005,
            f'{dataset_name}: {overall_score:.2f}',
            transform=ax.transAxes,
            ha='center',
            fontsize=12,
            fontweight='normal')

    ax.set_xlabel('Token Length', fontsize=13, fontweight='normal',
                  labelpad=1)
    ax.set_ylabel('Depth Percent(%)', fontsize=13, fontweight='normal',
                  labelpad=1)

    converted_labels = [convert_to_k(value)
                        for value in pivot_table.columns.values]
    ax.tick_params(axis='both', which='major', length=1, pad=1)
    ax.tick_params(axis='both', which='minor', length=1, pad=1)
    ax.set_xticklabels(converted_labels, rotation=45)

    index_length = len(pivot_table.index)
    selected_indices = pivot_table.index.values[::2]
    labels = [str(int(index)) for index in selected_indices]
    ax.set_yticks(np.arange(0, len(pivot_table.index), 2))
    ax.set_yticklabels(labels, rotation=0)

    for spine in ax.spines.values():
        spine.set_visible(False)
    for spine in ax2.spines.values():
        spine.set_visible(False)

    plt.tight_layout()
    plt.draw()

    directory_path, original_filename = os.path.split(save_path)
    filename_suffix = (title_name + '_' + dataset_name).replace(' ', '_')
    new_filename = f'{filename_suffix}.png'
    new_save_path = os.path.join(directory_path, new_filename)

    plt.savefig(new_save_path, format='png', bbox_inches='tight',
                pad_inches=0)
    print(f'Saved : {new_save_path}')
    plt.close()
    return overall_score

-def save_results_to_plots(txt_results_save_path):
-    content = read_after_specific_line_except_last(
-        txt_results_save_path, 'raw format', 2)
-    parsed_data = parse_model_scores(content)
-    model_names = get_dict_model_names(parsed_data)
-    numbers = [2, 3, 4, 5]
-    languages = ['en', 'zh']
-    size_exists = []
-    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k']
-    for size in sizes_origin:
-        if size in content:
-            size_exists.append(size)
-    multi_dataset_abbrs = [f'{num}needle_{lang}{size}'
-                           for num in numbers
-                           for lang in languages
-                           for size in size_exists]
-    origin_dataset_abbrs = [f'origin_{lang}{size}'
-                            for lang in languages
-                            for size in size_exists]
-    parallel_dataset_abbrs = [f'parallel_{lang}{size}'
-                              for lang in languages
-                              for size in size_exists]
-    dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + \
-        parallel_dataset_abbrs
-    base_path = os.path.dirname(txt_results_save_path)
-    plot_path = os.path.join(base_path, 'plots')
-    model_scores = {}
-    for model_name in tqdm(model_names):
-        # Dictionary to store scores for each dataset for the current model
-        model_datasets_scores = {}
-        for dataset_abbr in dataset_abbrs:
-            parallel_flag = 'parallel' in dataset_abbr
-            # Create a directory for each dataset_abbr
-            folder_path = os.path.join(plot_path, dataset_abbr)
-            ensure_directory(folder_path)
-            # Construct the full path to save the image
-            save_path = os.path.join(folder_path, f'{model_name}.png')
-            # Create DataFrame for the model and dataset
-            df = create_model_dataframe(parsed_data, model_name,
-                                        dataset_abbr, parallel=parallel_flag)
-            # Generate visualization and get the score
-            score = visualize(df, save_path, model_name, dataset_abbr)
-            # Store the score in the dictionary
-            model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score)
-        # Process and visualize the overall score
-        overall_score_pic_path = os.path.join(plot_path,
-                                              f'{model_name}_overall.png')
-        merged_df = merge_dataframes(model_name, dataset_abbrs, parsed_data)
-        averaged_df = calculate_elementwise_average(merged_df)
-        # Assume visualize returns the average score for the overall plot
-        overall_score = visualize(averaged_df, overall_score_pic_path,
-                                  'weighted_average_score', 'Overall Score')
-        # Add the overall score to the dictionary
-        model_datasets_scores['Overall'] = '{:.02f}'.format(overall_score)
-        # Add the model's scores to the main dictionary
-        model_scores[model_name] = model_datasets_scores
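The heatmap-plus-overlay pattern in visualize above is compact enough to reproduce standalone. A minimal sketch with the same seaborn heatmap and twin-axis average line; all scores are synthetic and the output filename is arbitrary:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap

# Synthetic depth x context-length scores in [0, 100]
rng = np.random.default_rng(0)
pivot = pd.DataFrame(rng.uniform(40, 100, size=(5, 4)),
                     index=[0, 25, 50, 75, 100],           # document depth (%)
                     columns=[4000, 8000, 32000, 128000])  # context length
mean_scores = pivot.mean().values

plt.figure(figsize=(10, 6))
ax = plt.gca()
cmap = LinearSegmentedColormap.from_list(
    'custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])
sns.heatmap(pivot, cmap=cmap, ax=ax, cbar_kws={'label': 'Score'},
            vmin=0, vmax=100)

# Overlay the per-column average as a white line on a twin axis;
# x positions i + 0.5 land on the heatmap cell centers.
ax2 = ax.twinx()
ax2.plot([i + 0.5 for i in range(len(mean_scores))], mean_scores,
         color='white', marker='o', linewidth=2, markersize=8,
         label='Average Depth Score')
ax2.set_ylim(0, 100)
ax2.set_yticks([])
ax2.legend(loc='lower left')
plt.savefig('needlebench_demo.png', bbox_inches='tight')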
def ensure_directory(path):
    if not os.path.exists(path):
...
@@ -263,29 +386,11 @@ def merge_dataframes(model_name, dataset_abbrs, parsed_data):
    merged_df = reduce(lambda left, right: pd.merge(
        left, right, on='dataset', how='outer'), dfs)

    if merged_df.isnull().any().any():
-       print('Warning: Some rows were filtered out due to NaN values. This is often due to mismatched row counts among DataFrames.')
        print('Warning: Some rows were filtered out due to NaN values. '
              'This is often due to mismatched row counts among DataFrames.')
        merged_df = merged_df.dropna()

    return merged_df
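merge_dataframes folds the per-dataset frames into one wide table via repeated outer joins on the dataset column. A toy illustration; frame contents are invented:

from functools import reduce
import pandas as pd

dfs = [
    pd.DataFrame({'dataset': ['Depth0', 'Depth50'], 'origin_en_4k': [90, 80]}),
    pd.DataFrame({'dataset': ['Depth0', 'Depth50'], 'parallel_en_4k': [70, 60]}),
]
merged = reduce(lambda left, right: pd.merge(left, right, on='dataset',
                                             how='outer'), dfs)
print(merged)
#    dataset  origin_en_4k  parallel_en_4k
# 0   Depth0            90              70
# 1  Depth50            80              60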
def calculate_elementwise_average(merged_df):
    score_columns = [col for col in merged_df.columns if col != 'dataset']

    origin_columns = [col for col in score_columns if 'origin' in col]
    parallel_columns = [col for col in score_columns if 'parallel' in col]
    multi_columns = [col for col in score_columns if 'needle' in col]

    if origin_columns and parallel_columns and multi_columns:
        origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4
        parallel_avg = merged_df[parallel_columns].mean(axis=1) * 0.3
        multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3
        merged_df['weighted_average_score'] = \
            origin_avg + parallel_avg + multi_avg
    else:
        merged_df['weighted_average_score'] = pd.Series([0] * len(merged_df))

    return merged_df.iloc[:, [0, -1]]
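The 0.4/0.3/0.3 weighting above favors single-needle retrieval over the parallel and multi-needle subsets. A self-contained sketch of the same element-wise computation on toy data; column names are illustrative, chosen only to carry the 'origin'/'parallel'/'needle' substrings the function keys on:

import pandas as pd

toy = pd.DataFrame({
    'dataset': ['Length4000Depth0', 'Length4000Depth50'],
    'origin_en_4k': [90.0, 80.0],    # single-needle retrieval
    'parallel_en_4k': [70.0, 60.0],  # multi-retrieval
    '2needle_en_4k': [50.0, 40.0],   # multi-needle reasoning
})
score_cols = [c for c in toy.columns if c != 'dataset']
weighted = (toy[[c for c in score_cols if 'origin' in c]].mean(axis=1) * 0.4
            + toy[[c for c in score_cols if 'parallel' in c]].mean(axis=1) * 0.3
            + toy[[c for c in score_cols if 'needle' in c]].mean(axis=1) * 0.3)
print(weighted.tolist())  # [72.0, 62.0]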
class NeedleBenchSummarizer(DefaultSummarizer):
    """NeedleBench summarizer in OpenCompass.
...
@@ -303,20 +408,17 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        summarizer_dataset_abbrs = []
        if self.dataset_abbrs is None:
            # display all dataset metrics included in the config
            for dataset_abbr in dataset_abbrs:
                if dataset_abbr in dataset_metrics:
                    for metric in dataset_metrics[dataset_abbr]:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
                else:
                    summarizer_dataset_abbrs.append((dataset_abbr, None))
            # along with all possible group metrics
            for dataset_abbr in dataset_metrics:
                for metric in dataset_metrics[dataset_abbr]:
                    if (dataset_abbr, metric) not in summarizer_dataset_abbrs:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
        else:
            # follow the required order
            for item in self.dataset_abbrs:
                if isinstance(item, str):
                    summarizer_dataset_abbrs.append((item, None))
...
@@ -332,6 +434,7 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        for dataset_abbr, metric in summarizer_dataset_abbrs:
            if dataset_abbr not in dataset_metrics:
                table.append([dataset_abbr, '-', '-', '-'] +
                             ['-'] * len(self.model_abbrs))
                table.append(header)
                continue
...
@@ -378,33 +481,7 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        raw_txts = '\n'.join(raw_txts)
        return raw_txts

    def _read_and_sort_dataframe(self, file_path):
        # Read the file without treating the first row as a header
        df = pd.read_csv(file_path, header=None)

        # Sort columns based on the values of a specific row,
        # excluding the first column
        def sort_columns_based_on_row_corrected(df, base_row_idx,
                                                start_row_idx, end_row_idx):
            # Extract the row values used as the sort key
            sort_values_row = df.iloc[base_row_idx, 1:].replace(
                '-', np.nan).apply(pd.to_numeric, errors='coerce')
            # Handle NaNs by setting them below the row minimum
            # so they sort to the end
            min_possible_value = sort_values_row.min(skipna=True) - 1
            sort_values_row_filled = sort_values_row.fillna(min_possible_value)
            # Get the sorted order of column indices, excluding the
            # first column
            sorted_col_indices = sort_values_row_filled.sort_values(
                ascending=False).index
            # Apply the sorted column indices to the affected rows,
            # keeping column 0 (the row labels) in place
            df.iloc[start_row_idx:end_row_idx + 1] = df.iloc[
                start_row_idx:end_row_idx + 1,
                [0] + sorted_col_indices.tolist()]

        # Apply the sorting function to each block of rows
        sort_columns_based_on_row_corrected(df, 1, 0, 2)  # rows 0-2 by row 1
        sort_columns_based_on_row_corrected(df, 4, 3, 7)  # rows 3-7 by row 4
        sort_columns_based_on_row_corrected(df, 9, 8, 12)  # rows 8-12 by row 9
        sort_columns_based_on_row_corrected(df, 14, 13, 25)  # rows 13-25 by row 14

        # Return the sorted DataFrame
        return df
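The helper above re-ranks model columns within each block of rows by one anchor row's values. A standalone sketch of the same idea on a toy frame; the data is invented, and the explicit `.to_numpy()` on the right-hand side is an assumption-flagged deviation that makes the reorder positional, since pandas aligns DataFrame-to-DataFrame assignments by column label:

import numpy as np
import pandas as pd

toy = pd.DataFrame([
    ['dataset', 'model_a', 'model_b', 'model_c'],
    ['overall', '70', '-', '90'],
    ['subset', '60', '50', '80'],
])
key = toy.iloc[1, 1:].replace('-', np.nan).apply(pd.to_numeric,
                                                 errors='coerce')
key = key.fillna(key.min(skipna=True) - 1)  # push '-' entries to the end
order = key.sort_values(ascending=False).index  # column positions, best first
# Keep column 0, permute the rest; .to_numpy() bypasses label alignment
toy.iloc[0:3] = toy.iloc[0:3, [0] + order.tolist()].to_numpy()
print(toy)  # model_c now precedes model_a and model_b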
    def _output_to_file(self, output_path, time_str, table, raw_txts):
        # output to file
        if output_path is None:
            output_path = osp.join(self.work_dir, 'summary',
                                   f'summary_{time_str}.txt')
            output_csv_path = osp.join(self.work_dir, 'summary',
                                       f'summary_{time_str}.csv')
...
@@ -436,38 +513,19 @@ class NeedleBenchSummarizer(DefaultSummarizer):
            f.write('\n'.join([','.join(row) for row in table]) + '\n')
        self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')

        df_sorted = self._read_and_sort_dataframe(output_csv_path)
        sorted_file_path = osp.abspath(output_csv_path).split('.')[0] \
            + '_sorted.csv'
        df_sorted.to_csv(sorted_file_path, index=False, header=False)
        self.logger.info(f'write sorted csv to {sorted_file_path}')
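One hedged aside on the `_sorted.csv` naming above: `split('.')[0]` truncates at the first dot anywhere in the absolute path, so a work directory containing a dot would mangle the name. `os.path.splitext` is the usual dot-safe spelling; a sketch of that alternative (the path is hypothetical, and this is a suggestion, not what the file does):

import os.path as osp

output_csv_path = '/tmp/summary/summary_20240407.csv'  # hypothetical path
root, _ = osp.splitext(osp.abspath(output_csv_path))   # strips only '.csv'
sorted_file_path = root + '_sorted.csv'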
    def summarize(self,
                  output_path: str = None,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):  # noqa
        # pick up results
        raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \
            self._pick_up_results()

        # calculate group metrics
        raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \
            self._calculate_group_metrics(raw_results, parsed_results,
                                          dataset_metrics, dataset_eval_mode)
        # format table
        table = self._format_table(parsed_results, dataset_metrics,
                                   dataset_eval_mode)
        # format raw txt
        raw_txts = self._format_raw_txt(raw_results)

        # output to screen
        print(tabulate.tabulate(table, headers='firstrow'))

        # output to .text / .csv files
        self._output_to_file(output_path, time_str, table, raw_txts)

        if self.lark_reporter:
            content = f'{getpass.getuser()} 的'
            content += f'详细评测汇总已输出至 {osp.abspath(output_path)}'
            # Lark message; in English: "<user>'s detailed evaluation
            # summary has been written to <path>"
...