Unverified commit 16f29b25 authored by Mo Li, committed by GitHub

[Fix] Simplify needlebench summarizer (#1024)

* Conflicts:
	configs/summarizers/needlebench.py

* fix lint problems
parent f2af4933
from mmengine.config import read_base

with read_base():
    from .atc_choice_20 import *

needle_num_list = list(range(2, 50, 1))
needlebench_datasets = []

for _name in list(single_choice_prompts.keys()):
    needlebench_atc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=(single_choice_prompts[_name])),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer,),
    )

    needlebench_atc_eval_cfg = dict(
        evaluator=dict(type=CircularEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    for num_needles in needle_num_list:
        abbr = (f'NeedleBenchATCDataset-'
                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
        language = "English" if "en" in _name else "Chinese"
        if 'reasoning' in _name:
            abbr += '-Reasoning'
        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
            'path': names_path,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
            'eval_cfg': needlebench_atc_eval_cfg
        }
        needlebench_datasets.append(dataset_dict)
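# Illustration only (not part of this config): how the abbreviation built in the
# loop above looks for a hypothetical prompt key. The key name below is an
# assumption for the example; real keys come from `single_choice_prompts`.
def _example_atc_abbr(name, num_needles):
    abbr = (f'NeedleBenchATCDataset-'
            f'{num_needles}Needle-{"EN" if "en" in name else "ZH"}')
    if 'reasoning' in name:
        abbr += '-Reasoning'
    return abbr

assert _example_atc_abbr('single_choice_en_reasoning', 2) == \
    'NeedleBenchATCDataset-2Needle-EN-Reasoning'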
from mmengine.config import read_base

with read_base():
    from .atc_choice_20 import *

needle_num_list = list(range(2, 80, 1))
needlebench_datasets = []

for _name in list(single_choice_prompts.keys()):
    needlebench_atc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=(single_choice_prompts[_name])),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer,),
    )

    needlebench_atc_eval_cfg = dict(
        evaluator=dict(type=CircularEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    for num_needles in needle_num_list:
        abbr = (f'NeedleBenchATCDataset-'
                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
        language = "English" if "en" in _name else "Chinese"
        if 'reasoning' in _name:
            abbr += '-Reasoning'
        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
            'path': names_path,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
            'eval_cfg': needlebench_atc_eval_cfg
        }
        needlebench_datasets.append(dataset_dict)
from opencompass.summarizers.needlebench import NeedleBenchSummarizer
from opencompass.summarizers.needlebench import NeedleBenchATCSummarizer
# ----------NeedleBench-4k-summarizer----------
context_lengths_4k = list(range(1000, 5000, 1000))
depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57, 63, 68, 73, 78, 84, 89, 94, 100]
depths_list_sparse = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
# Initialize the lists
_needlebench_4k_2needle_en = []
_needlebench_4k_3needle_en = []
_needlebench_4k_4needle_en = []
_needlebench_4k_5needle_en = []
_needlebench_4k_2needle_zh = []
_needlebench_4k_3needle_zh = []
_needlebench_4k_4needle_zh = []
_needlebench_4k_5needle_zh = []
_needlebench_4k_origin_en = []
_needlebench_4k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_4k:
    for depth_percent in depths:
        _needlebench_4k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_4k')
        _needlebench_4k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_4k')
        _needlebench_4k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_4k')
        _needlebench_4k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_4k')
        _needlebench_4k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_4k')
        _needlebench_4k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_4k')
        _needlebench_4k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_4k')
        _needlebench_4k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_4k')
        _needlebench_4k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_4k')
        _needlebench_4k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_4k')
# Concatenate the multi-needle and origin lists
_needlebench_4k_multi_needle_en = _needlebench_4k_2needle_en + _needlebench_4k_3needle_en + _needlebench_4k_4needle_en + _needlebench_4k_5needle_en
_needlebench_4k_multi_needle_zh = _needlebench_4k_2needle_zh + _needlebench_4k_3needle_zh + _needlebench_4k_4needle_zh + _needlebench_4k_5needle_zh
_needlebench_4k_origin = _needlebench_4k_origin_en + _needlebench_4k_origin_zh
_needlebench_4k_multi_needle = _needlebench_4k_multi_needle_en + _needlebench_4k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_4k_parallel_en = []
_needlebench_4k_parallel_zh = []
for original_context_length in context_lengths_4k:
    _needlebench_4k_parallel_en.append(f'Length{original_context_length}_parallel_en_4k')
for original_context_length in context_lengths_4k:
    _needlebench_4k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_4k')
_needlebench_4k_parallel = _needlebench_4k_parallel_en + _needlebench_4k_parallel_zh
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_4k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_4k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_4k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_4k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_4k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_4k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_4k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_4k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_4k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_4k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_4k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_4k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_4k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_4k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_4k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_4k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_4k_parallel_en},
def create_m_rs_names_list(context_lengths, depths, needle_counts,
                           languages, dataset_size):
    names_dict = {}
    multi_needle_list = []
    multi_needle_en_list = []
    multi_needle_zh_list = []

    for needle_count in needle_counts:
        for language in languages:
            key = f"{needle_count}-Needle-{language.upper()}-{dataset_size.upper()}"
            names_list = [
                f"Length{length}Depth{int(depth)}_{needle_count}needle_{language}_{dataset_size}"
                for length in context_lengths
                for depth in depths
            ]
            names_dict[key] = names_list

            multi_needle_list.extend(names_list)
            if language == 'en':
                multi_needle_en_list.extend(names_list)
            elif language == 'zh':
                multi_needle_zh_list.extend(names_list)
    names_dict['Multi-Needle-Reasoning(M-RS)'] = multi_needle_list
    names_dict['Multi-Needle-Reasoning-EN'] = multi_needle_en_list
    names_dict['Multi-Needle-Reasoning-ZH'] = multi_needle_zh_list

    return names_dict
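# Quick sanity check (illustration only, toy arguments): the helper above maps a
# (needle count, language) pair to a key like '2-Needle-EN-4K' plus a list of
# per-length/per-depth dataset names, alongside the aggregated M-RS keys.
_example_names = create_m_rs_names_list([1000], [0, 100], ['2'], ['en'], '4k')
assert _example_names['2-Needle-EN-4K'] == [
    'Length1000Depth0_2needle_en_4k',
    'Length1000Depth100_2needle_en_4k',
]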
def create_summarizer(context_lengths, depths, dataset_size,
                      sparse_depths=None):
    needle_counts = ["2", "3", "4", "5"]
    languages = ["en", "zh"]
    if sparse_depths:
        depths = sparse_depths
    names_dict = {}
    multi_reasoning_names = create_m_rs_names_list(
        context_lengths, depths, needle_counts, languages, dataset_size)

    names_dict.update(multi_reasoning_names)

    single_needle_list = []
    single_needle_en_list = []
    single_needle_zh_list = []

    for language in languages:
        names_list = [
            f"Length{length}Depth{int(depth)}_origin_{language}_{dataset_size}"
            for length in context_lengths
            for depth in depths
        ]
        single_needle_list.extend(names_list)
        if language == 'en':
            single_needle_en_list.extend(names_list)
        elif language == 'zh':
            single_needle_zh_list.extend(names_list)
    names_dict['Single-Needle-Retrieval(S-RT)'] = single_needle_list
    names_dict['Single-Needle-Retrieval-EN'] = single_needle_en_list
    names_dict['Single-Needle-Retrieval-ZH'] = single_needle_zh_list

    parallel_list = []
    parallel_en_list = []
    parallel_zh_list = []

    for language in languages:
        names_list = [
            f"Length{length}_parallel_{language}_{dataset_size}"
            for length in context_lengths
        ]
        parallel_list.extend(names_list)
        if language == 'en':
            parallel_en_list.extend(names_list)
        elif language == 'zh':
            parallel_zh_list.extend(names_list)
    names_dict['Multi-Needle-Retrieval(M-RT)'] = parallel_list
    names_dict['Multi-Needle-Retrieval-EN'] = parallel_en_list
    names_dict['Multi-Needle-Retrieval-ZH'] = parallel_zh_list

    summary_groups = [
        {'name': key, 'subsets': value} for key, value in names_dict.items()
    ]

    summary_groups.append({
        'name': 'NeedleBench-Overall-Score',
        'subsets': [['Single-Needle-Retrieval(S-RT)', 'naive_average'],
                    ['Multi-Needle-Reasoning(M-RS)', 'naive_average'],
                    ['Multi-Needle-Retrieval(M-RT)', 'average_score']],
        'weights': {'Single-Needle-Retrieval(S-RT)': 0.4,
                    'Multi-Needle-Reasoning(M-RS)': 0.3,
                    'Multi-Needle-Retrieval(M-RT)': 0.3}})

    summarizer_config = {
        'type': NeedleBenchSummarizer,
        'summary_groups': summary_groups,
        'dataset_abbrs': [
            'NeedleBench-Overall-Score',
            f'--------- NeedleBench-{dataset_size.upper()}-Single-Needle-Retrieval ---------',
            'Single-Needle-Retrieval(S-RT)',
            'Single-Needle-Retrieval-EN',
            'Single-Needle-Retrieval-ZH',
            f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Retrieval ---------',
            'Multi-Needle-Retrieval(M-RT)',
            'Multi-Needle-Retrieval-EN',
            'Multi-Needle-Retrieval-ZH',
            f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Reasoning ---------',
            'Multi-Needle-Reasoning(M-RS)',
            'Multi-Needle-Reasoning-EN',
            'Multi-Needle-Reasoning-ZH',
            '2-Needle-EN-4K',
            '2-Needle-ZH-4K',
            '3-Needle-EN-4K',
            '3-Needle-ZH-4K',
            '4-Needle-EN-4K',
            '4-Needle-ZH-4K',
            '5-Needle-EN-4K',
            '5-Needle-ZH-4K',
        ]
    }
    return summarizer_config
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
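# Worked example (hypothetical scores, assuming the weights above are applied as
# a weighted average): origin=80, multi_needle=60 and parallel=70 would combine
# into an 'overall' score of 0.4 * 80 + 0.3 * 60 + 0.3 * 70 = 71.0.
assert abs(0.4 * 80 + 0.3 * 60 + 0.3 * 70 - 71.0) < 1e-9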
needlebench_4k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-4k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-4k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-4k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_4k_origin, *_needlebench_4k_multi_needle, *_needlebench_4k_parallel,
],
summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-8k-summarizer----------
depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57, 63, 68, 73, 78, 84, 89, 94, 100]
depths_list_sparse = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
context_lengths_4k = list(range(1000, 5000, 1000))
needlebench_4k_summarizer = create_summarizer(context_lengths_4k, depths, "4k")
context_lengths_8k = list(range(5000, 9000, 1000))
# Initialize the lists
_needlebench_8k_2needle_en = []
_needlebench_8k_3needle_en = []
_needlebench_8k_4needle_en = []
_needlebench_8k_5needle_en = []
_needlebench_8k_2needle_zh = []
_needlebench_8k_3needle_zh = []
_needlebench_8k_4needle_zh = []
_needlebench_8k_5needle_zh = []
_needlebench_8k_origin_en = []
_needlebench_8k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_8k:
    for depth_percent in depths:
        _needlebench_8k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_8k')
        _needlebench_8k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_8k')
        _needlebench_8k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_8k')
        _needlebench_8k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_8k')
        _needlebench_8k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_8k')
        _needlebench_8k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_8k')
        _needlebench_8k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_8k')
        _needlebench_8k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_8k')
        _needlebench_8k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_8k')
        _needlebench_8k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_8k')
# Concatenate the multi-needle and origin lists
_needlebench_8k_multi_needle_en = _needlebench_8k_2needle_en + _needlebench_8k_3needle_en + _needlebench_8k_4needle_en + _needlebench_8k_5needle_en
_needlebench_8k_multi_needle_zh = _needlebench_8k_2needle_zh + _needlebench_8k_3needle_zh + _needlebench_8k_4needle_zh + _needlebench_8k_5needle_zh
_needlebench_8k_origin = _needlebench_8k_origin_en + _needlebench_8k_origin_zh
_needlebench_8k_multi_needle = _needlebench_8k_multi_needle_en + _needlebench_8k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_8k_parallel_en = []
_needlebench_8k_parallel_zh = []
for original_context_length in context_lengths_8k:
    _needlebench_8k_parallel_en.append(f'Length{original_context_length}_parallel_en_8k')
for original_context_length in context_lengths_8k:
    _needlebench_8k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_8k')
_needlebench_8k_parallel = _needlebench_8k_parallel_en + _needlebench_8k_parallel_zh
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_8k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_8k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_8k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_8k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_8k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_8k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_8k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_8k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_8k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_8k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_8k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_8k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_8k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_8k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_8k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_8k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_8k_parallel_en},
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
needlebench_8k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-8k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-8k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-8k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
],
summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-32k-summarizer----------
needlebench_8k_summarizer = create_summarizer(context_lengths_8k, depths, "8k")
context_lengths_32k = [9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]
# Initialize the lists
_needlebench_32k_2needle_en = []
_needlebench_32k_3needle_en = []
_needlebench_32k_4needle_en = []
_needlebench_32k_5needle_en = []
_needlebench_32k_2needle_zh = []
_needlebench_32k_3needle_zh = []
_needlebench_32k_4needle_zh = []
_needlebench_32k_5needle_zh = []
_needlebench_32k_origin_en = []
_needlebench_32k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_32k:
    for depth_percent in depths_list_sparse:
        _needlebench_32k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_32k')
        _needlebench_32k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_32k')
        _needlebench_32k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_32k')
        _needlebench_32k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_32k')
        _needlebench_32k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_32k')
        _needlebench_32k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_32k')
        _needlebench_32k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_32k')
        _needlebench_32k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_32k')
        _needlebench_32k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_32k')
        _needlebench_32k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_32k')
# Concatenate the multi-needle and origin lists
_needlebench_32k_multi_needle_en = _needlebench_32k_2needle_en + _needlebench_32k_3needle_en + _needlebench_32k_4needle_en + _needlebench_32k_5needle_en
_needlebench_32k_multi_needle_zh = _needlebench_32k_2needle_zh + _needlebench_32k_3needle_zh + _needlebench_32k_4needle_zh + _needlebench_32k_5needle_zh
_needlebench_32k_origin = _needlebench_32k_origin_en + _needlebench_32k_origin_zh
_needlebench_32k_multi_needle = _needlebench_32k_multi_needle_en + _needlebench_32k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_32k_parallel_en = []
_needlebench_32k_parallel_zh = []
for original_context_length in context_lengths_32k:
    _needlebench_32k_parallel_en.append(f'Length{original_context_length}_parallel_en_32k')
for original_context_length in context_lengths_32k:
    _needlebench_32k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_32k')
_needlebench_32k_parallel = _needlebench_32k_parallel_en + _needlebench_32k_parallel_zh
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_32k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_32k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_32k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_32k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_32k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_32k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_32k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_32k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_32k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_32k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_32k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_32k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_32k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_32k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_32k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_32k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_32k_parallel_en},
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
needlebench_32k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-32k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-32k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-32k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_32k_origin, *_needlebench_32k_multi_needle, *_needlebench_32k_parallel,
],
summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-128k-summarizer----------
needlebench_32k_summarizer = create_summarizer(context_lengths_32k, depths_list_sparse, "32k")
context_lengths_128k = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
# Initialize the lists
_needlebench_128k_2needle_en = []
_needlebench_128k_3needle_en = []
_needlebench_128k_4needle_en = []
_needlebench_128k_5needle_en = []
_needlebench_128k_2needle_zh = []
_needlebench_128k_3needle_zh = []
_needlebench_128k_4needle_zh = []
_needlebench_128k_5needle_zh = []
_needlebench_128k_origin_en = []
_needlebench_128k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_128k:
    for depth_percent in depths_list_sparse:
        _needlebench_128k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_128k')
        _needlebench_128k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_128k')
        _needlebench_128k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_128k')
        _needlebench_128k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_128k')
        _needlebench_128k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_128k')
        _needlebench_128k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_128k')
        _needlebench_128k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_128k')
        _needlebench_128k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_128k')
        _needlebench_128k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_128k')
        _needlebench_128k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_128k')
# Concatenate the multi-needle and origin lists
_needlebench_128k_multi_needle_en = _needlebench_128k_2needle_en + _needlebench_128k_3needle_en + _needlebench_128k_4needle_en + _needlebench_128k_5needle_en
_needlebench_128k_multi_needle_zh = _needlebench_128k_2needle_zh + _needlebench_128k_3needle_zh + _needlebench_128k_4needle_zh + _needlebench_128k_5needle_zh
_needlebench_128k_origin = _needlebench_128k_origin_en + _needlebench_128k_origin_zh
_needlebench_128k_multi_needle = _needlebench_128k_multi_needle_en + _needlebench_128k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_128k_parallel_en = []
_needlebench_128k_parallel_zh = []
for original_context_length in context_lengths_128k:
    _needlebench_128k_parallel_en.append(f'Length{original_context_length}_parallel_en_128k')
for original_context_length in context_lengths_128k:
    _needlebench_128k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_128k')
_needlebench_128k_parallel = _needlebench_128k_parallel_en + _needlebench_128k_parallel_zh
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_128k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_128k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_128k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_128k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_128k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_128k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_128k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_128k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_128k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_128k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_128k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_128k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_128k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_128k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_128k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_128k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_128k_parallel_en},
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
needlebench_128k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-128k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-128k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-128k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_128k_origin, *_needlebench_128k_multi_needle, *_needlebench_128k_parallel,
],
summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-200k-summarizer----------
needlebench_128k_summarizer = create_summarizer(context_lengths_128k, depths_list_sparse, "128k")
context_lengths_200k = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000])
# Initialize the lists
_needlebench_200k_2needle_en = []
_needlebench_200k_3needle_en = []
_needlebench_200k_4needle_en = []
_needlebench_200k_5needle_en = []
_needlebench_200k_2needle_zh = []
_needlebench_200k_3needle_zh = []
_needlebench_200k_4needle_zh = []
_needlebench_200k_5needle_zh = []
_needlebench_200k_origin_en = []
_needlebench_200k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_200k:
    for depth_percent in depths_list_sparse:
        _needlebench_200k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_200k')
        _needlebench_200k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_200k')
        _needlebench_200k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_200k')
        _needlebench_200k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_200k')
        _needlebench_200k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_200k')
        _needlebench_200k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_200k')
        _needlebench_200k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_200k')
        _needlebench_200k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_200k')
        _needlebench_200k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_200k')
        _needlebench_200k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_200k')
# Concatenate the multi-needle and origin lists
_needlebench_200k_multi_needle_en = _needlebench_200k_2needle_en + _needlebench_200k_3needle_en + _needlebench_200k_4needle_en + _needlebench_200k_5needle_en
_needlebench_200k_multi_needle_zh = _needlebench_200k_2needle_zh + _needlebench_200k_3needle_zh + _needlebench_200k_4needle_zh + _needlebench_200k_5needle_zh
_needlebench_200k_origin = _needlebench_200k_origin_en + _needlebench_200k_origin_zh
_needlebench_200k_multi_needle = _needlebench_200k_multi_needle_en + _needlebench_200k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_200k_parallel_en = []
_needlebench_200k_parallel_zh = []
for original_context_length in context_lengths_200k:
    _needlebench_200k_parallel_en.append(f'Length{original_context_length}_parallel_en_200k')
for original_context_length in context_lengths_200k:
    _needlebench_200k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_200k')
_needlebench_200k_parallel = _needlebench_200k_parallel_en + _needlebench_200k_parallel_zh
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_200k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_200k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_200k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_200k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_200k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_200k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_200k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_200k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_200k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_200k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_200k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_200k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_200k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_200k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_200k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_200k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_200k_parallel_en},
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
needlebench_200k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-200k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-200k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-200k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_200k_origin, *_needlebench_200k_multi_needle, *_needlebench_200k_parallel,
],
summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-1000k-summarizer----------
needlebench_200k_summarizer = create_summarizer(context_lengths_200k, depths_list_sparse, "200k")
context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
# Initialize the lists
_needlebench_1000k_2needle_en = []
_needlebench_1000k_3needle_en = []
_needlebench_1000k_4needle_en = []
_needlebench_1000k_5needle_en = []
_needlebench_1000k_2needle_zh = []
_needlebench_1000k_3needle_zh = []
_needlebench_1000k_4needle_zh = []
_needlebench_1000k_5needle_zh = []
_needlebench_1000k_origin_en = []
_needlebench_1000k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_1000k:
    for depth_percent in depths_list_sparse:
        _needlebench_1000k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_1000k')
        _needlebench_1000k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_1000k')
        _needlebench_1000k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_1000k')
        _needlebench_1000k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_1000k')
        _needlebench_1000k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_1000k')
        _needlebench_1000k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_1000k')
        _needlebench_1000k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_1000k')
        _needlebench_1000k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_1000k')
        _needlebench_1000k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_1000k')
        _needlebench_1000k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_1000k')
# Concatenate the multi-needle and origin lists
_needlebench_1000k_multi_needle_en = _needlebench_1000k_2needle_en + _needlebench_1000k_3needle_en + _needlebench_1000k_4needle_en + _needlebench_1000k_5needle_en
_needlebench_1000k_multi_needle_zh = _needlebench_1000k_2needle_zh + _needlebench_1000k_3needle_zh + _needlebench_1000k_4needle_zh + _needlebench_1000k_5needle_zh
_needlebench_1000k_origin = _needlebench_1000k_origin_en + _needlebench_1000k_origin_zh
_needlebench_1000k_multi_needle = _needlebench_1000k_multi_needle_en + _needlebench_1000k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_1000k_parallel_en = []
_needlebench_1000k_parallel_zh = []
for original_context_length in context_lengths_1000k:
    _needlebench_1000k_parallel_en.append(f'Length{original_context_length}_parallel_en_1000k')
for original_context_length in context_lengths_1000k:
    _needlebench_1000k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_1000k')
_needlebench_1000k_parallel = _needlebench_1000k_parallel_en + _needlebench_1000k_parallel_zh
needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, "1000k")
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_1000k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_1000k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_1000k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_1000k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_1000k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_1000k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_1000k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_1000k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_1000k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_1000k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_1000k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_1000k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_1000k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_1000k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_1000k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_1000k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_1000k_parallel_en},
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
needlebench_1000k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-1000k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-1000k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-1000k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_1000k_origin, *_needlebench_1000k_multi_needle, *_needlebench_1000k_parallel,
],
summary_groups=needlebench_summary_groups,
)
context_lengths_8k = list(range(5000, 9000, 1000))
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_8k_parallel_en_batch1 = []
_needlebench_8k_parallel_en_batch5 = []
_needlebench_8k_parallel_en_batch10 = []
@@ -713,7 +202,6 @@ needlebench_8k_batch_overall_summarizer = dict(
'parallel_version_en_batch15',
'parallel_version_zh_batch20',
'parallel_version_en_batch20',
# *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
],
summary_groups=needlebench_summary_groups,
)
@@ -754,64 +242,72 @@ needlebench_8k_batch_depth0_summarizer = dict(
'parallel_version_en_batch15',
'parallel_version_zh_batch20',
'parallel_version_en_batch20',
# *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
],
summary_groups=needlebench_summary_groups,
)
needle_num_list = list(range(2, 20, 1))
categories = ['ZH', 'EN', 'ZH-Reasoning', 'EN-Reasoning', 'ZH-CircularEval', 'EN-CircularEval', 'ZH-Reasoning-Circular', 'EN-Reasoning-Circular']
needlebench_atc_summary_groups = []

for category in categories:
    metric = 'perf_4' if 'CircularEval' in category else 'acc_1'
    cleaned_category = category.replace('-CircularEval', '').replace('-Circular', '')
    subsets = [f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}'
               for num_needles in needle_num_list]
def gen_atc_summarizer(needle_num_list):
    categories = [
        'ZH-Direct-CE', 'EN-Direct-CE',
        'ZH-Reasoning-CE', 'EN-Reasoning-CE'
    ]
    needlebench_atc_summary_groups = []

    # Generate one summary group per category
    for category in categories:
        # CircularEval (CE) categories are scored with perf_4; otherwise use acc_1
        metric = 'perf_4' if 'CE' in category else 'acc_1'
        # Dataset names do not carry the CircularEval suffix, so strip it when building subsets
        cleaned_category = category.replace('-CE', '').replace('-Direct', '')
        needlebench_atc_summary_groups.append({
            'name': category,
            'subsets': [
                [f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}', metric]
                for num_needles in needle_num_list
            ],
            'weights': {f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}': num_needles for num_needles in needle_num_list},
        })
    needlebench_atc_summary_groups.append({
        'name': category,
        'subsets': [
            [f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}',
             metric]
            for num_needles in needle_num_list
        ]
    })
atc_dataset_abbrs = []
for category in categories:
    title = f'######## Needlebench-ATC-{category}-Score ########'
    atc_dataset_abbrs.append(title)
    weighted_average_score_entry = [f'{category}', 'weighted_average']
    atc_dataset_abbrs.append(weighted_average_score_entry)
if atc_dataset_abbrs[-1] == '------------------------------------------':
    atc_dataset_abbrs.pop()
needlebench_atc_summarizer = dict(
dataset_abbrs=[
*atc_dataset_abbrs,
'######## Needlebench-ATC Accuracy ########', # category
*[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'acc_1'] for num_needles in needle_num_list],
'------------------------------------------',
*[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'acc_1'] for num_needles in needle_num_list],
'------------------------------------------',
*[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'acc_1'] for num_needles in needle_num_list],
'------------------------------------------',
*[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'acc_1'] for num_needles in needle_num_list],
'------------------------------------------',
'######## Needlebench-ATC CircularEval ########', # category
*[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'perf_4'] for num_needles in needle_num_list],
'------------------------------------------',
*[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'perf_4'] for num_needles in needle_num_list],
'------------------------------------------',
*[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'perf_4'] for num_needles in needle_num_list],
'------------------------------------------',
*[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'perf_4'] for num_needles in needle_num_list],
'------------------------------------------',
],
summary_groups=needlebench_atc_summary_groups
)
    needlebench_atc_summary_groups.append({
        'name': 'ATC-CE-Overall',
        'subsets': [
            [f'{category}', 'weighted_average'] for category in categories
        ],
    })
    atc_dataset_abbrs = []
    atc_dataset_abbrs.append(['ATC-CE-Overall', 'naive_average'])
    for category in categories:
        weighted_average_score_entry = [f'{category}', 'weighted_average']
        atc_dataset_abbrs.append(weighted_average_score_entry)

    needlebench_atc_summarizer = dict(
        dataset_abbrs=[
            *atc_dataset_abbrs,
            '######## Needlebench-ATC Accuracy ########',  # category
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            '######## Needlebench-ATC CircularEval ########',  # category
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
        ],
        summary_groups=needlebench_atc_summary_groups
    )
    return needlebench_atc_summarizer
atc_summarizer_20 = gen_atc_summarizer(list(range(2, 20, 1)))
atc_summarizer_50 = gen_atc_summarizer(list(range(2, 50, 1)))
atc_summarizer_80 = gen_atc_summarizer(list(range(2, 80, 1)))
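# Illustration (hypothetical scores): in gen_atc_summarizer each subset's weight
# equals its needle count, so items with more needles count more. Assuming
# weighted_average divides by the sum of weights, needle_num_list=[2, 3] with
# per-subset scores 90 and 60 would give (2 * 90 + 3 * 60) / (2 + 3) = 72.0.
assert abs((2 * 90 + 3 * 60) / (2 + 3) - 72.0) < 1e-9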
@@ -5,6 +5,7 @@ import getpass
import math
import os
import os.path as osp
import shutil
from datetime import datetime
from typing import Any, Dict, List, Optional
@@ -26,6 +27,92 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
                               model_abbr_from_cfg)
from opencompass.utils.prompt import get_prompt_hash
model_name_mapping = {
'llama-2-7b-chat-hf': 'LLaMA-2-7B',
'llama-2-13b-chat-hf': 'LLaMA-2-13B',
'llama-2-70b-chat-hf': 'LLaMA-2-70B',
'baichuan2-7b-chat-hf': 'Baichuan2-7B',
'baichuan2-13b-chat-hf': 'Baichuan2-13B',
'yi-6b-chat-hf': 'Yi-6B',
'yi-34b-chat-hf': 'Yi-34B',
'deepseek-67b-chat-hf': 'DeepSeek-67B',
'wizardlm-70b-v1.0-vllm': 'WizardLM-70B',
'qwen-14b-chat-hf': 'Qwen-14B',
'qwen-72b-chat-hf': 'Qwen-72B',
'qwen-72b-chat-vllm': 'Qwen-72B-vLLM',
'internlm2-chat-7b-turbomind': 'InternLM2-7B-200K',
'internlm2-chat-20b-turbomind': 'InternLM2-20B-200K',
'internlm2-chat-7b-hf': 'InternLM2-7B',
'internlm2-chat-20b-hf': 'InternLM2-20B',
'qwen-7b-chat-hf': 'Qwen-7B',
'chatglm3-6b-hf': 'ChatGLM3-6B',
'chatglm3-6b-32k-hf': 'ChatGLM3-6B-32K',
'zephyr-7b-beta-vllm': 'Zephyr-7B Beta',
'mistral-7b-instruct-v0.2-vllm': 'Mistral-7B Inst. v0.2',
'mistral-7b-instruct-v0.1-vllm': 'Mistral-7B Inst. v0.1',
'mixtral-8x7b-instruct-v0.1-vllm': 'Mixtral-8x7B Inst. v0.1',
'orionstar-yi-34b-chat-hf': 'OrionStar-Yi-34B',
'orionstar-14b-long-chat-vllm': 'Orion-14B-LongChat',
'internlm-chat-7b-hf': 'InternLM-7B',
'gemma-2b-it-hf': 'Gemma-2B',
'gemma-7b-it-hf': 'Gemma-7B',
'qwen1.5-0.5b-chat-hf': 'Qwen-1.5-0.5B',
'qwen1.5-1.8b-chat-hf': 'Qwen-1.5-1.8B',
'qwen1.5-4b-chat-hf': 'Qwen-1.5-4B',
'qwen1.5-14b-chat-hf': 'Qwen-1.5-14B',
'qwen1.5-72b-chat-hf': 'Qwen-1.5-72B',
'qwen1.5-14b-chat-vllm': 'Qwen-1.5-14B-vLLM',
'qwen1.5-72b-chat-vllm': 'Qwen-1.5-72B-vLLM',
'glm4_notools': 'GLM-4',
'claude-3-opus': 'Claude-3-Opus',
# Add more mappings as necessary
}
dataset_mapping_dict = {}
needle_counts = ['2', '3', '4', '5']
languages = ['en', 'zh']
sizes = ['4k', '8k', '32k', '200k', '1000k']
types = ['origin', 'parallel']
for needle_count in needle_counts:
    for language in languages:
        for size in sizes:
            key = f'{needle_count}needle_{language}_{size}'
            value = f'{needle_count}-Needle-Reasoning-{language.upper()}-{size.upper()}'
            dataset_mapping_dict[key] = value

for t in types:
    for language in languages:
        for size in sizes:
            if t == 'origin':
                key = f'{t}_{language}_{size}'
                value = f'Single-Needle-Retrieval-{language.upper()}-{size.upper()}'
            elif t == 'parallel':
                key = f'{t}_{language}_{size}'
                value = f'Multi-Needle-Retrieval-{language.upper()}-{size.upper()}'
            dataset_mapping_dict[key] = value
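# A few resulting entries, for reference (derived from the loops above):
assert dataset_mapping_dict['2needle_en_4k'] == '2-Needle-Reasoning-EN-4K'
assert dataset_mapping_dict['origin_zh_8k'] == 'Single-Needle-Retrieval-ZH-8K'
assert dataset_mapping_dict['parallel_en_32k'] == 'Multi-Needle-Retrieval-EN-32K'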
def calculate_elementwise_average(model_name, merged_df):
    score_columns = [col for col in merged_df.columns if col != 'dataset']

    origin_columns = [col for col in score_columns if 'origin' in col]
    parallel_columns = [col for col in score_columns if 'parallel' in col]
    multi_columns = [col for col in score_columns if 'needle' in col]

    if origin_columns and parallel_columns and multi_columns:
        origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4
        parallel_avg = merged_df[parallel_columns].mean(axis=1) * 0.3
        multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3
        merged_df[model_name] = origin_avg + parallel_avg + multi_avg
    else:
        relevant_columns = origin_columns or parallel_columns or multi_columns
        if relevant_columns:
            merged_df[model_name] = merged_df[relevant_columns].mean(axis=1)
        else:
            merged_df[model_name] = pd.Series([0] * len(merged_df))

    return merged_df.iloc[:, [0, -1]]
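# Minimal usage sketch (illustration only; the model name and column names are
# made up but follow the 'origin'/'parallel'/'needle' patterns the function
# keys on). Not called anywhere in this module.
def _calculate_elementwise_average_example():
    toy = pd.DataFrame({
        'dataset': ['Length1000Depth0'],
        'origin_en_4k': [80.0],
        'parallel_en_4k': [70.0],
        '2needle_en_4k': [60.0],
    })
    result = calculate_elementwise_average('my-model', toy)
    # 0.4 * 80 + 0.3 * 70 + 0.3 * 60 == 71.0
    assert abs(result['my-model'].iloc[0] - 71.0) < 1e-9
    return result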
def read_after_specific_line_except_last(file_name, keyword, offset):
    with open(file_name, 'r', encoding='utf-8') as file:
@@ -65,6 +152,12 @@ def create_model_dataframe(nested_dict, model_name, dataset_abbr, parallel=False
    df = pd.DataFrame(data, columns=['dataset', model_name])
    return df
def convert_to_k(value):
    try:
        return f'{int(value) // 1000}k'
    except ValueError:
        return value
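# Examples (derived from the helper above); non-numeric values pass through unchanged:
assert convert_to_k(32000) == '32k'
assert convert_to_k('abc') == 'abc'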
def parse_model_scores(text):
    lines = text.split('\n')

@@ -82,8 +175,86 @@ def parse_model_scores(text):
    return result_dict
def remove_empty_subfolders(plot_path):
    for folder_name in tqdm(os.listdir(plot_path),
                            desc='Deleting Empty folders'):
        folder_path = os.path.join(plot_path, folder_name)
        if os.path.isdir(folder_path):
            if not os.listdir(folder_path):
                shutil.rmtree(folder_path)
def save_results_to_plots(txt_results_save_path):

    content = read_after_specific_line_except_last(txt_results_save_path, 'raw format', 2)
    parsed_data = parse_model_scores(content)
    model_names = get_dict_model_names(parsed_data)

    numbers = [2, 3, 4, 5]
    languages = ['en', 'zh']
    size_exists = []
    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k']

    for size in sizes_origin:
        if size in content:
            size_exists.append(size)

    multi_dataset_abbrs = [f'{num}needle_{lang}{size}' for num in numbers for lang in languages for size in size_exists]
    origin_dataset_abbrs = [f'origin_{lang}{size}' for lang in languages for size in size_exists]
    parallel_dataset_abbrs = [f'parallel_{lang}{size}' for lang in languages for size in size_exists]

    dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + \
        parallel_dataset_abbrs
    base_path = os.path.dirname(txt_results_save_path)
    plot_path = os.path.join(base_path, 'plots')
    model_scores = {}
    for model_name in tqdm(model_names):
        model_datasets_scores = {}  # Scores for each dataset of the current model
        for dataset_abbr in dataset_abbrs:
            parallel_flag = 'parallel' in dataset_abbr
            folder_path = os.path.join(plot_path, dataset_mapping_dict[dataset_abbr])
            ensure_directory(folder_path)

            save_path = os.path.join(folder_path, f'{model_name}.png')

            df = create_model_dataframe(parsed_data, model_name, dataset_abbr, parallel=parallel_flag)

            score = visualize(df, save_path, model_name, dataset_abbr)

            model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score)

        overall_dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + parallel_dataset_abbrs
        overall_score_pic_path = os.path.join(plot_path, f'{model_name}_overall.png')
        merged_df = merge_dataframes(model_name, overall_dataset_abbrs, parsed_data)
        averaged_df = calculate_elementwise_average(model_name, merged_df)
        overall_score = visualize(averaged_df, overall_score_pic_path, model_name, 'Overall Score')

        # Single-Retrieval
        single_retrieval_score_pic_path = os.path.join(plot_path, f'{model_name}_single_retrieval_overall.png')
        single_retrieval_merged_df = merge_dataframes(model_name, origin_dataset_abbrs, parsed_data)
        single_retrieval_averaged_df = calculate_elementwise_average(model_name, single_retrieval_merged_df)
        single_retrieval_overall_score = visualize(single_retrieval_averaged_df, single_retrieval_score_pic_path, model_name, 'Single-Retrieval Overall Score')

        # Multi-Retrieval
        multi_retrieval_score_pic_path = os.path.join(plot_path, f'{model_name}_multi_retrieval_overall.png')
        multi_retrieval_merged_df = merge_dataframes(model_name, parallel_dataset_abbrs, parsed_data)
        multi_retrieval_averaged_df = calculate_elementwise_average(model_name, multi_retrieval_merged_df)
        multi_retrieval_overall_score = visualize(multi_retrieval_averaged_df, multi_retrieval_score_pic_path, model_name, 'Multi-Retrieval Overall Score')

        # Multi-Reasoning
        multi_reasoning_score_pic_path = os.path.join(plot_path, f'{model_name}_multi_reasoning_overall.png')
        multi_reasoning_merged_df = merge_dataframes(model_name, multi_dataset_abbrs, parsed_data)
        multi_reasoning_averaged_df = calculate_elementwise_average(model_name, multi_reasoning_merged_df)
        multi_reasoning_overall_score = visualize(multi_reasoning_averaged_df, multi_reasoning_score_pic_path, model_name, 'Multi-Reasoning Overall Score')

        model_scores[model_name] = averaged_df

    remove_empty_subfolders(plot_path)

    return model_scores
def visualize(df_raw, save_path: str, model_name: str, dataset_type: str):
    df = df_raw.copy()
    if df.empty:
        return -1
    df['Context Length'] = df['dataset'].apply(
        lambda x: int(x.split('Length')[1].split('Depth')[0]))
    df['Document Depth'] = df['dataset'].apply(
@@ -98,144 +269,96 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
model_df = df[['Document Depth', 'Context Length',
model_name]].copy()
model_df.rename(columns={model_name: 'Score'}, inplace=True)
# Create pivot table
pivot_table = pd.pivot_table(model_df,
                             values='Score',
                             index=['Document Depth'],
                             columns=['Context Length'],
                             aggfunc='mean')
# Calculate mean scores
mean_scores = pivot_table.mean().values
# Calculate overall score
overall_score = mean_scores.mean()
# Create heatmap and line plot
plt.figure(figsize=(15.5, 8))
plt.figure(figsize=(10, 6))
ax = plt.gca()
cmap = LinearSegmentedColormap.from_list(
'custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])
# Draw heatmap
sns.heatmap(pivot_table,
cmap=cmap,
ax=ax,
cbar_kws={'label': 'Score'},
vmin=0,
vmax=100)
# Set line plot data
cbar = ax.collections[0].colorbar
x_data = [i + 0.5 for i in range(len(mean_scores))]
y_data = mean_scores
# Create twin axis for line plot
ax2 = ax.twinx()
# Draw line plot
ax2.plot(x_data,
         y_data,
         color='white',
         marker='o',
         linestyle='-',
         linewidth=2,
         markersize=8,
         label='Average Depth Score')
# Set y-axis range
ax2.set_ylim(0, 100)
# Hide original y-axis ticks and labels
ax2.set_yticklabels([])
ax2.set_yticks([])
# Add legend
ax2.legend(loc='upper left')
# Set chart title and labels
ax.set_title(f'{model_name} {dataset_type} Context '
'Performance\nFact Retrieval Across '
'Context Lengths ("Needle In A Haystack")')
ax.set_xlabel('Token Limit')
ax.set_ylabel('Depth Percent')
ax.set_xticklabels(pivot_table.columns.values, rotation=45)
ax.set_yticklabels(pivot_table.index.values, rotation=0)
# Add overall score as a subtitle
plt.text(0.5,
-0.13, f'Overall Score for {model_name}: '
f'{overall_score:.2f}',
ha='center',
va='center',
transform=ax.transAxes,
fontsize=13)
plt.tight_layout()
plt.subplots_adjust(right=1)
plt.draw()
plt.savefig(save_path)
print(f'Saved :{save_path}')
plt.close() # Close figure to prevent memory leaks
return overall_score
def save_results_to_plots(txt_results_save_path):
content = read_after_specific_line_except_last(txt_results_save_path, 'raw format', 2)
parsed_data = parse_model_scores(content)
model_names = get_dict_model_names(parsed_data)
numbers = [2, 3, 4, 5]
languages = ['en', 'zh']
size_exists = []
sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k']
for size in sizes_origin:
if size in content:
size_exists.append(size)
multi_dataset_abbrs = [f'{num}needle_{lang}{size}' for num in numbers for lang in languages for size in size_exists]
origin_dataset_abbrs = [f'origin_{lang}{size}' for lang in languages for size in size_exists]
parallel_dataset_abbrs = [f'parallel_{lang}{size}' for lang in languages for size in size_exists]
dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + \
parallel_dataset_abbrs
base_path = os.path.dirname(txt_results_save_path)
plot_path = os.path.join(base_path, 'plots')
model_scores = {}
for model_name in tqdm(model_names):
model_datasets_scores = {} # Dictionary to store scores for each dataset for the current model
for dataset_abbr in dataset_abbrs:
parallel_flag = 'parallel' in dataset_abbr
ax2.legend(loc='lower left')
# Create a directory for each dataset_abbr
folder_path = os.path.join(plot_path, dataset_abbr)
ensure_directory(folder_path)
if model_name in model_name_mapping:
title_name = model_name_mapping[model_name]
else:
title_name = model_name
# Construct the full path to save the image
save_path = os.path.join(folder_path, f'{model_name}.png')
ax.set_title(title_name, fontsize=12, fontweight='bold', pad=15)
# Create DataFrame for the model and dataset
df = create_model_dataframe(parsed_data, model_name, dataset_abbr, parallel=parallel_flag)
if dataset_type in dataset_mapping_dict:
dataset_name = dataset_mapping_dict[dataset_type]
else:
dataset_name = dataset_type
ax.text(0.5, 1.005, f'{dataset_name}:{overall_score:.2f}',
transform=ax.transAxes,
ha='center',
fontsize=12,
fontweight='normal')
ax.set_xlabel('Token Length', fontsize=13, fontweight='normal', labelpad=1)
ax.set_ylabel('Depth Percent(%)', fontsize=13, fontweight='normal', labelpad=1)
converted_labels = [convert_to_k(value) for value in pivot_table.columns.values]
ax.tick_params(axis='both', which='major', length=1, pad=1)
ax.tick_params(axis='both', which='minor', length=1, pad=1)
ax.set_xticklabels(converted_labels, rotation=45)
index_length = len(pivot_table.index)
selected_indices = pivot_table.index.values[::2]
labels = [str(int(index)) for index in selected_indices]
ax.set_yticks(np.arange(0, len(pivot_table.index), 2))
ax.set_yticklabels(labels, rotation=0)
for spine in ax.spines.values():
spine.set_visible(False)
for spine in ax2.spines.values():
spine.set_visible(False)
# Generate visualization and get the score
score = visualize(df, save_path, model_name, dataset_abbr)
plt.tight_layout()
plt.draw()
directory_path, original_filename = os.path.split(save_path)
# Store the score in the dictionary
model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score)
filename_suffix = (title_name+'_'+dataset_name).replace(' ', '_')
new_filename = f'{filename_suffix}.png'
# Process and visualize the overall score
overall_score_pic_path = os.path.join(plot_path, f'{model_name}_overall.png')
merged_df = merge_dataframes(model_name, dataset_abbrs, parsed_data)
new_save_path = os.path.join(directory_path, new_filename)
print(merge_dataframes)
averaged_df = calculate_elementwise_average(merged_df)
plt.savefig(new_save_path, format='png', bbox_inches='tight', pad_inches=0)
print(f'Saved :{new_save_path}')
# Assume visualize returns the average score for the overall visualization
overall_score = visualize(averaged_df, overall_score_pic_path, 'weighted_average_score', 'Overall Score')
plt.close()
# Add the overall score to the dictionary
model_datasets_scores['Overall'] = '{:.02f}'.format(overall_score)
return overall_score
# Add the model's scores to the main dictionary
model_scores[model_name] = model_datasets_scores
def ensure_directory(path):
    if not os.path.exists(path):

@@ -263,29 +386,11 @@ def merge_dataframes(model_name, dataset_abbrs, parsed_data):
    merged_df = reduce(lambda left, right: pd.merge(left, right, on='dataset', how='outer'), dfs)

    if merged_df.isnull().any().any():
        print('Warning: Some rows were filtered out due to NaN values. '
              'This is often due to mismatched row counts among DataFrames.')
        merged_df = merged_df.dropna()

    return merged_df
def calculate_elementwise_average(merged_df):
    score_columns = [col for col in merged_df.columns if col != 'dataset']

    origin_columns = [col for col in score_columns if 'origin' in col]
    parallel_columns = [col for col in score_columns if 'parallel' in col]
    multi_columns = [col for col in score_columns if 'needle' in col]

    if origin_columns and parallel_columns and multi_columns:
        origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4
        parallel_avg = merged_df[parallel_columns].mean(axis=1) * 0.3
        multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3
        merged_df['weighted_average_score'] = origin_avg + parallel_avg + multi_avg
    else:
        merged_df['weighted_average_score'] = pd.Series([0] * len(merged_df))

    return merged_df.iloc[:, [0, -1]]
class NeedleBenchSummarizer(DefaultSummarizer):
    """NeedleBench summarizer in OpenCompass.

@@ -303,20 +408,17 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        summarizer_dataset_abbrs = []
        if self.dataset_abbrs is None:
            # display all dataset metrics included in the config
            for dataset_abbr in dataset_abbrs:
                if dataset_abbr in dataset_metrics:
                    for metric in dataset_metrics[dataset_abbr]:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
                else:
                    summarizer_dataset_abbrs.append((dataset_abbr, None))
            # along with all possible group metrics
            for dataset_abbr in dataset_metrics:
                for metric in dataset_metrics[dataset_abbr]:
                    if (dataset_abbr, metric) not in summarizer_dataset_abbrs:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
        else:
            # follow the required order
            for item in self.dataset_abbrs:
                if isinstance(item, str):
                    summarizer_dataset_abbrs.append((item, None))

@@ -332,6 +434,7 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        for dataset_abbr, metric in summarizer_dataset_abbrs:
            if dataset_abbr not in dataset_metrics:
                table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
                table.append(header)
                continue

@@ -378,33 +481,7 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        raw_txts = '\n'.join(raw_txts)
        return raw_txts
    def _read_and_sort_dataframe(self, file_path):
        # Read the file without treating the first row as a header
        df = pd.read_csv(file_path, header=None)

        # Sort columns based on the values of a specific row, excluding the first column
        def sort_columns_based_on_row_corrected(df, base_row_idx, start_row_idx, end_row_idx):
            # Extract the row used for sorting
            sort_values_row = df.iloc[base_row_idx, 1:].replace('-', np.nan).apply(pd.to_numeric, errors='coerce')
            # Handle NaNs by filling them with a value below the row minimum so they sort to the end
            min_possible_value = sort_values_row.min(skipna=True) - 1
            sort_values_row_filled = sort_values_row.fillna(min_possible_value)
            # Get the sorted order of indices, excluding the first column
            sorted_col_indices = sort_values_row_filled.sort_values(ascending=False).index
            # Apply the sorted column indices to the whole block, adjusting for Python's 0-based index
            df.iloc[start_row_idx:end_row_idx+1] = df.iloc[start_row_idx:end_row_idx+1, [0] + sorted_col_indices.tolist()]

        # Apply the sorting function to each block of rows
        sort_columns_based_on_row_corrected(df, 1, 0, 2)    # For rows 1-2 based on row 2's values
        sort_columns_based_on_row_corrected(df, 4, 3, 7)    # For rows 4-7 based on row 5's values
        sort_columns_based_on_row_corrected(df, 9, 8, 12)   # For rows 9-12 based on row 10's values
        sort_columns_based_on_row_corrected(df, 14, 13, 25) # For rows 14-25 based on row 15's values

        # Return the sorted DataFrame
        return df
    def _output_to_file(self, output_path, time_str, table, raw_txts):
        # output to file
        if output_path is None:
            output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt')
            output_csv_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.csv')

@@ -436,38 +513,19 @@ class NeedleBenchSummarizer(DefaultSummarizer):
            f.write('\n'.join([','.join(row) for row in table]) + '\n')
        self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')

        df_sorted = self._read_and_sort_dataframe(output_csv_path)

        sorted_file_path = osp.abspath(output_csv_path).split('.')[0] + '_sorted.csv'
        df_sorted.to_csv(sorted_file_path, index=False, header=False)

        self.logger.info(f'write sorted csv to {sorted_file_path}')
    def summarize(
            self,
            output_path: str = None,
            time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):  # noqa

        # pick up results
        raw_results, parsed_results, dataset_metrics, dataset_eval_mode = self._pick_up_results()

        # calculate group metrics
        raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \
            self._calculate_group_metrics(raw_results, parsed_results, dataset_metrics, dataset_eval_mode)

        # format table
        table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode)

        # format raw txt
        raw_txts = self._format_raw_txt(raw_results)

        # output to screen
        print(tabulate.tabulate(table, headers='firstrow'))

        # output to .text / .csv files
        self._output_to_file(output_path, time_str, table, raw_txts)

        if self.lark_reporter:
            content = f'{getpass.getuser()} 的'
            content += f'详细评测汇总已输出至 {osp.abspath(output_path)}'