Unverified Commit 16f29b25 authored by Mo Li, committed by GitHub

[Fix] Simplify needlebench summarizer (#1024)

* Conflicts:
	configs/summarizers/needlebench.py

* fix lint problems
parent f2af4933
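The summarizer configs in this change are consumed like any other OpenCompass summarizer. A minimal run-config sketch follows (the relative import path and the idea that datasets/models live in separate config fragments are assumptions, not part of this commit):

from mmengine.config import read_base

with read_base():
    # Assumed relative path; point this at configs/summarizers/needlebench.py.
    from .summarizers.needlebench import needlebench_4k_summarizer

# Hand the generated dict to the runner as the active summarizer;
# datasets and models are defined in other config fragments.
summarizer = needlebench_4k_summarizer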
from mmengine.config import read_base

with read_base():
    from .atc_choice_20 import *

needle_num_list = list(range(2, 50, 1))
needlebench_datasets = []

for _name in list(single_choice_prompts.keys()):

    needlebench_atc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=(single_choice_prompts[_name])),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer,),
    )

    needlebench_atc_eval_cfg = dict(
        evaluator=dict(type=CircularEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    for num_needles in needle_num_list:
        abbr = (f'NeedleBenchATCDataset-'
                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
        language = "English" if "en" in _name else "Chinese"
        if 'reasoning' in _name:
            abbr += '-Reasoning'
        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
            'path': names_path,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
            'eval_cfg': needlebench_atc_eval_cfg
        }
        needlebench_datasets.append(dataset_dict)
from mmengine.config import read_base

with read_base():
    from .atc_choice_20 import *

needle_num_list = list(range(2, 80, 1))
needlebench_datasets = []

for _name in list(single_choice_prompts.keys()):

    needlebench_atc_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=(single_choice_prompts[_name])),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer,),
    )

    needlebench_atc_eval_cfg = dict(
        evaluator=dict(type=CircularEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    for num_needles in needle_num_list:
        abbr = (f'NeedleBenchATCDataset-'
                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
        language = "English" if "en" in _name else "Chinese"
        if 'reasoning' in _name:
            abbr += '-Reasoning'
        dataset_dict = {
            'abbr': abbr,
            'type': NeedleBenchATCDataset,
            'path': names_path,
            'num_needles': num_needles,
            'language': language,
            'repeats': repeats,
            'with_circular': with_circular_eval,
            'reader_cfg': needlebench_atc_reader_cfg,
            'infer_cfg': needlebench_atc_infer_cfg,
            'eval_cfg': needlebench_atc_eval_cfg
        }
        needlebench_datasets.append(dataset_dict)
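As a quick sanity check of the abbreviation scheme used in the loops above, the snippet below reproduces the naming logic in isolation (the two prompt keys are assumed examples of what atc_choice_20 exports in single_choice_prompts):

# Sketch: reproduce the abbr naming used above with assumed prompt keys.
for _name in ['single_choice_zh', 'single_choice_en_reasoning']:
    for num_needles in [2, 10]:
        abbr = (f'NeedleBenchATCDataset-'
                f'{num_needles}Needle-{"EN" if "en" in _name else "ZH"}')
        if 'reasoning' in _name:
            abbr += '-Reasoning'
        print(abbr)
# NeedleBenchATCDataset-2Needle-ZH
# NeedleBenchATCDataset-10Needle-ZH
# NeedleBenchATCDataset-2Needle-EN-Reasoning
# NeedleBenchATCDataset-10Needle-EN-Reasoning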
from opencompass.summarizers.needlebench import NeedleBenchSummarizer
from opencompass.summarizers.needlebench import NeedleBenchATCSummarizer
# ----------NeedleBench-4k-summarizer----------
context_lengths_4k = list(range(1000, 5000, 1000))
depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57, 63, 68, 73, 78, 84, 89, 94, 100]
depths_list_sparse = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
# Initialize the lists
_needlebench_4k_2needle_en = []
_needlebench_4k_3needle_en = []
_needlebench_4k_4needle_en = []
_needlebench_4k_5needle_en = []
_needlebench_4k_2needle_zh = []
_needlebench_4k_3needle_zh = []
_needlebench_4k_4needle_zh = []
_needlebench_4k_5needle_zh = []
_needlebench_4k_origin_en = []
_needlebench_4k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_4k:
for depth_percent in depths:
_needlebench_4k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_4k')
_needlebench_4k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_4k')
_needlebench_4k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_4k')
_needlebench_4k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_4k')
_needlebench_4k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_4k')
_needlebench_4k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_4k')
_needlebench_4k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_4k')
_needlebench_4k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_4k')
_needlebench_4k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_4k')
_needlebench_4k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_4k')
# Concatenate the multi-needle and origin lists
_needlebench_4k_multi_needle_en = _needlebench_4k_2needle_en + _needlebench_4k_3needle_en + _needlebench_4k_4needle_en + _needlebench_4k_5needle_en
_needlebench_4k_multi_needle_zh = _needlebench_4k_2needle_zh + _needlebench_4k_3needle_zh + _needlebench_4k_4needle_zh + _needlebench_4k_5needle_zh
_needlebench_4k_origin = _needlebench_4k_origin_en + _needlebench_4k_origin_zh
_needlebench_4k_multi_needle = _needlebench_4k_multi_needle_en + _needlebench_4k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_4k_parallel_en = []
_needlebench_4k_parallel_zh = []
for original_context_length in context_lengths_4k:
_needlebench_4k_parallel_en.append(f'Length{original_context_length}_parallel_en_4k')
for original_context_length in context_lengths_4k:
_needlebench_4k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_4k')
_needlebench_4k_parallel = _needlebench_4k_parallel_en + _needlebench_4k_parallel_zh
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_4k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_4k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_4k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_4k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_4k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_4k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_4k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_4k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_4k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_4k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_4k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_4k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_4k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_4k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_4k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_4k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_4k_parallel_en},
def create_m_rs_names_list(context_lengths, depths, needle_counts,
                           languages, dataset_size):
    names_dict = {}
    multi_needle_list = []
    multi_needle_en_list = []
    multi_needle_zh_list = []

    for needle_count in needle_counts:
        for language in languages:
            key = f"{needle_count}-Needle-{language.upper()}-{dataset_size.upper()}"
            names_list = [
                f"Length{length}Depth{int(depth)}_{needle_count}needle_{language}_{dataset_size}"
                for length in context_lengths
                for depth in depths
            ]
            names_dict[key] = names_list

            multi_needle_list.extend(names_list)
            if language == 'en':
                multi_needle_en_list.extend(names_list)
            elif language == 'zh':
                multi_needle_zh_list.extend(names_list)
    names_dict['Multi-Needle-Reasoning(M-RS)'] = multi_needle_list
    names_dict['Multi-Needle-Reasoning-EN'] = multi_needle_en_list
    names_dict['Multi-Needle-Reasoning-ZH'] = multi_needle_zh_list

    return names_dict


def create_summarizer(context_lengths, depths, dataset_size,
                      sparse_depths=None):
    needle_counts = ["2", "3", "4", "5"]
    languages = ["en", "zh"]
    if sparse_depths:
        depths = sparse_depths
    names_dict = {}
    multi_reasoning_names = create_m_rs_names_list(
        context_lengths, depths, needle_counts, languages, dataset_size)

    names_dict.update(multi_reasoning_names)

    single_needle_list = []
    single_needle_en_list = []
    single_needle_zh_list = []

    for language in languages:
        names_list = [
            f"Length{length}Depth{int(depth)}_origin_{language}_{dataset_size}"
            for length in context_lengths
            for depth in depths
        ]
        single_needle_list.extend(names_list)
        if language == 'en':
            single_needle_en_list.extend(names_list)
        elif language == 'zh':
            single_needle_zh_list.extend(names_list)
    names_dict['Single-Needle-Retrieval(S-RT)'] = single_needle_list
    names_dict['Single-Needle-Retrieval-EN'] = single_needle_en_list
    names_dict['Single-Needle-Retrieval-ZH'] = single_needle_zh_list

    parallel_list = []
    parallel_en_list = []
    parallel_zh_list = []

    for language in languages:
        names_list = [
            f"Length{length}_parallel_{language}_{dataset_size}"
            for length in context_lengths
        ]
        parallel_list.extend(names_list)
        if language == 'en':
            parallel_en_list.extend(names_list)
        elif language == 'zh':
            parallel_zh_list.extend(names_list)
    names_dict['Multi-Needle-Retrieval(M-RT)'] = parallel_list
    names_dict['Multi-Needle-Retrieval-EN'] = parallel_en_list
    names_dict['Multi-Needle-Retrieval-ZH'] = parallel_zh_list

    summary_groups = [
        {'name': key, 'subsets': value} for key, value in names_dict.items()
    ]

    summary_groups.append({
        'name': 'NeedleBench-Overall-Score',
        'subsets': [['Single-Needle-Retrieval(S-RT)', 'naive_average'],
                    ['Multi-Needle-Reasoning(M-RS)', 'naive_average'],
                    ['Multi-Needle-Retrieval(M-RT)', 'average_score']],
        'weights': {'Single-Needle-Retrieval(S-RT)': 0.4,
                    'Multi-Needle-Reasoning(M-RS)': 0.3,
                    'Multi-Needle-Retrieval(M-RT)': 0.3}})

    summarizer_config = {
        'type': NeedleBenchSummarizer,
        'summary_groups': summary_groups,
        'dataset_abbrs': [
            'NeedleBench-Overall-Score',
            f'--------- NeedleBench-{dataset_size.upper()}-Single-Needle-Retrieval ---------',
            'Single-Needle-Retrieval(S-RT)',
            'Single-Needle-Retrieval-EN',
            'Single-Needle-Retrieval-ZH',
            f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Retrieval ---------',
            'Multi-Needle-Retrieval(M-RT)',
            'Multi-Needle-Retrieval-EN',
            'Multi-Needle-Retrieval-ZH',
            f'--------- NeedleBench-{dataset_size.upper()}-Multi-Needle-Reasoning ---------',
            'Multi-Needle-Reasoning(M-RS)',
            'Multi-Needle-Reasoning-EN',
            'Multi-Needle-Reasoning-ZH',
            '2-Needle-EN-4K',
            '2-Needle-ZH-4K',
            '3-Needle-EN-4K',
            '3-Needle-ZH-4K',
            '4-Needle-EN-4K',
            '4-Needle-ZH-4K',
            '5-Needle-EN-4K',
            '5-Needle-ZH-4K',
        ]
    }
    return summarizer_config
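For illustration, calling the helper directly shows the shape of the returned config (a sketch; the depth list here is arbitrary, and the counts follow from the code above):

# Sketch: inspect what create_summarizer builds for a 4k run.
cfg = create_summarizer(list(range(1000, 5000, 1000)), [0, 10, 21], '4k')
print(cfg['type'])                 # NeedleBenchSummarizer
print(len(cfg['summary_groups']))  # 18: 17 named groups plus NeedleBench-Overall-Score
print(cfg['dataset_abbrs'][0])     # NeedleBench-Overall-Score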
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
needlebench_4k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-4k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-4k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-4k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_4k_origin, *_needlebench_4k_multi_needle, *_needlebench_4k_parallel,
],
summary_groups=needlebench_summary_groups,
)
depths = [0, 5, 10, 15, 21, 26, 31, 36, 42, 47, 52, 57, 63, 68, 73, 78, 84, 89, 94, 100]
depths_list_sparse = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
context_lengths_4k = list(range(1000, 5000, 1000))
needlebench_4k_summarizer = create_summarizer(context_lengths_4k, depths, "4k")

# ----------NeedleBench-8k-summarizer----------
context_lengths_8k = list(range(5000, 9000, 1000))
needlebench_8k_summarizer = create_summarizer(context_lengths_8k, depths, "8k")
# Initialize the lists
_needlebench_8k_2needle_en = []
_needlebench_8k_3needle_en = []
_needlebench_8k_4needle_en = []
_needlebench_8k_5needle_en = []
_needlebench_8k_2needle_zh = []
_needlebench_8k_3needle_zh = []
_needlebench_8k_4needle_zh = []
_needlebench_8k_5needle_zh = []
_needlebench_8k_origin_en = []
_needlebench_8k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_8k:
for depth_percent in depths:
_needlebench_8k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_8k')
_needlebench_8k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_8k')
_needlebench_8k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_8k')
_needlebench_8k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_8k')
_needlebench_8k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_8k')
_needlebench_8k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_8k')
_needlebench_8k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_8k')
_needlebench_8k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_8k')
_needlebench_8k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_8k')
_needlebench_8k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_8k')
# Concatenate the multi-needle and origin lists
_needlebench_8k_multi_needle_en = _needlebench_8k_2needle_en + _needlebench_8k_3needle_en + _needlebench_8k_4needle_en + _needlebench_8k_5needle_en
_needlebench_8k_multi_needle_zh = _needlebench_8k_2needle_zh + _needlebench_8k_3needle_zh + _needlebench_8k_4needle_zh + _needlebench_8k_5needle_zh
_needlebench_8k_origin = _needlebench_8k_origin_en + _needlebench_8k_origin_zh
_needlebench_8k_multi_needle = _needlebench_8k_multi_needle_en + _needlebench_8k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_8k_parallel_en = []
_needlebench_8k_parallel_zh = []
for original_context_length in context_lengths_8k:
_needlebench_8k_parallel_en.append(f'Length{original_context_length}_parallel_en_8k')
for original_context_length in context_lengths_8k:
_needlebench_8k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_8k')
_needlebench_8k_parallel = _needlebench_8k_parallel_en + _needlebench_8k_parallel_zh
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_8k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_8k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_8k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_8k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_8k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_8k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_8k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_8k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_8k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_8k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_8k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_8k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_8k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_8k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_8k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_8k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_8k_parallel_en},
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
needlebench_8k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-8k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-8k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-8k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
],
summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-32k-summarizer----------
context_lengths_32k = [9000, 13000, 17000, 21000, 25000, 29000, 31000, 32000]
needlebench_32k_summarizer = create_summarizer(context_lengths_32k, depths_list_sparse, "32k")
# Initialize the lists
_needlebench_32k_2needle_en = []
_needlebench_32k_3needle_en = []
_needlebench_32k_4needle_en = []
_needlebench_32k_5needle_en = []
_needlebench_32k_2needle_zh = []
_needlebench_32k_3needle_zh = []
_needlebench_32k_4needle_zh = []
_needlebench_32k_5needle_zh = []
_needlebench_32k_origin_en = []
_needlebench_32k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_32k:
for depth_percent in depths_list_sparse:
_needlebench_32k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_32k')
_needlebench_32k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_32k')
_needlebench_32k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_32k')
_needlebench_32k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_32k')
_needlebench_32k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_32k')
_needlebench_32k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_32k')
_needlebench_32k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_32k')
_needlebench_32k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_32k')
_needlebench_32k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_32k')
_needlebench_32k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_32k')
# Concatenate the multi-needle and origin lists
_needlebench_32k_multi_needle_en = _needlebench_32k_2needle_en + _needlebench_32k_3needle_en + _needlebench_32k_4needle_en + _needlebench_32k_5needle_en
_needlebench_32k_multi_needle_zh = _needlebench_32k_2needle_zh + _needlebench_32k_3needle_zh + _needlebench_32k_4needle_zh + _needlebench_32k_5needle_zh
_needlebench_32k_origin = _needlebench_32k_origin_en + _needlebench_32k_origin_zh
_needlebench_32k_multi_needle = _needlebench_32k_multi_needle_en + _needlebench_32k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_32k_parallel_en = []
_needlebench_32k_parallel_zh = []
for original_context_length in context_lengths_32k:
_needlebench_32k_parallel_en.append(f'Length{original_context_length}_parallel_en_32k')
for original_context_length in context_lengths_32k:
_needlebench_32k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_32k')
_needlebench_32k_parallel = _needlebench_32k_parallel_en + _needlebench_32k_parallel_zh
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_32k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_32k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_32k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_32k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_32k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_32k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_32k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_32k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_32k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_32k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_32k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_32k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_32k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_32k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_32k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_32k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_32k_parallel_en},
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
needlebench_32k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-32k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-32k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-32k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_32k_origin, *_needlebench_32k_multi_needle, *_needlebench_32k_parallel,
],
summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-128k-summarizer----------
context_lengths_128k = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000])
needlebench_128k_summarizer = create_summarizer(context_lengths_128k, depths_list_sparse, "128k")
# Initialize the lists
_needlebench_128k_2needle_en = []
_needlebench_128k_3needle_en = []
_needlebench_128k_4needle_en = []
_needlebench_128k_5needle_en = []
_needlebench_128k_2needle_zh = []
_needlebench_128k_3needle_zh = []
_needlebench_128k_4needle_zh = []
_needlebench_128k_5needle_zh = []
_needlebench_128k_origin_en = []
_needlebench_128k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_128k:
for depth_percent in depths_list_sparse:
_needlebench_128k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_128k')
_needlebench_128k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_128k')
_needlebench_128k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_128k')
_needlebench_128k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_128k')
_needlebench_128k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_128k')
_needlebench_128k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_128k')
_needlebench_128k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_128k')
_needlebench_128k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_128k')
_needlebench_128k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_128k')
_needlebench_128k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_128k')
# Concatenate the multi-needle and origin lists
_needlebench_128k_multi_needle_en = _needlebench_128k_2needle_en + _needlebench_128k_3needle_en + _needlebench_128k_4needle_en + _needlebench_128k_5needle_en
_needlebench_128k_multi_needle_zh = _needlebench_128k_2needle_zh + _needlebench_128k_3needle_zh + _needlebench_128k_4needle_zh + _needlebench_128k_5needle_zh
_needlebench_128k_origin = _needlebench_128k_origin_en + _needlebench_128k_origin_zh
_needlebench_128k_multi_needle = _needlebench_128k_multi_needle_en + _needlebench_128k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_128k_parallel_en = []
_needlebench_128k_parallel_zh = []
for original_context_length in context_lengths_128k:
_needlebench_128k_parallel_en.append(f'Length{original_context_length}_parallel_en_128k')
for original_context_length in context_lengths_128k:
_needlebench_128k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_128k')
_needlebench_128k_parallel = _needlebench_128k_parallel_en + _needlebench_128k_parallel_zh
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_128k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_128k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_128k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_128k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_128k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_128k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_128k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_128k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_128k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_128k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_128k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_128k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_128k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_128k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_128k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_128k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_128k_parallel_en},
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
needlebench_128k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-128k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-128k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-128k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_128k_origin, *_needlebench_128k_multi_needle, *_needlebench_128k_parallel,
],
summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-200k-summarizer----------
context_lengths_200k = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000])
needlebench_200k_summarizer = create_summarizer(context_lengths_200k, depths_list_sparse, "200k")

# Initialize the lists
_needlebench_200k_2needle_en = []
_needlebench_200k_3needle_en = []
_needlebench_200k_4needle_en = []
_needlebench_200k_5needle_en = []
_needlebench_200k_2needle_zh = []
_needlebench_200k_3needle_zh = []
_needlebench_200k_4needle_zh = []
_needlebench_200k_5needle_zh = []
_needlebench_200k_origin_en = []
_needlebench_200k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_200k:
for depth_percent in depths_list_sparse:
_needlebench_200k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_200k')
_needlebench_200k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_200k')
_needlebench_200k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_200k')
_needlebench_200k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_200k')
_needlebench_200k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_200k')
_needlebench_200k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_200k')
_needlebench_200k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_200k')
_needlebench_200k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_200k')
_needlebench_200k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_200k')
_needlebench_200k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_200k')
# Concatenate the multi-needle and origin lists
_needlebench_200k_multi_needle_en = _needlebench_200k_2needle_en + _needlebench_200k_3needle_en + _needlebench_200k_4needle_en + _needlebench_200k_5needle_en
_needlebench_200k_multi_needle_zh = _needlebench_200k_2needle_zh + _needlebench_200k_3needle_zh + _needlebench_200k_4needle_zh + _needlebench_200k_5needle_zh
_needlebench_200k_origin = _needlebench_200k_origin_en + _needlebench_200k_origin_zh
_needlebench_200k_multi_needle = _needlebench_200k_multi_needle_en + _needlebench_200k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_200k_parallel_en = []
_needlebench_200k_parallel_zh = []
for original_context_length in context_lengths_200k:
_needlebench_200k_parallel_en.append(f'Length{original_context_length}_parallel_en_200k')
for original_context_length in context_lengths_200k:
_needlebench_200k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_200k')
_needlebench_200k_parallel = _needlebench_200k_parallel_en + _needlebench_200k_parallel_zh
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_200k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_200k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_200k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_200k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_200k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_200k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_200k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_200k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_200k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_200k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_200k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_200k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_200k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_200k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_200k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_200k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_200k_parallel_en},
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
needlebench_200k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-200k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-200k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-200k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_200k_origin, *_needlebench_200k_multi_needle, *_needlebench_200k_parallel,
],
summary_groups=needlebench_summary_groups,
)
# ----------NeedleBench-1000k-summarizer----------
context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, "1000k")

# Initialize the lists
_needlebench_1000k_2needle_en = []
_needlebench_1000k_3needle_en = []
_needlebench_1000k_4needle_en = []
_needlebench_1000k_5needle_en = []
_needlebench_1000k_2needle_zh = []
_needlebench_1000k_3needle_zh = []
_needlebench_1000k_4needle_zh = []
_needlebench_1000k_5needle_zh = []
_needlebench_1000k_origin_en = []
_needlebench_1000k_origin_zh = []
# Fill the lists using nested loops
for original_context_length in context_lengths_1000k:
for depth_percent in depths_list_sparse:
_needlebench_1000k_2needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_en_1000k')
_needlebench_1000k_3needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_en_1000k')
_needlebench_1000k_4needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_en_1000k')
_needlebench_1000k_5needle_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_en_1000k')
_needlebench_1000k_2needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_2needle_zh_1000k')
_needlebench_1000k_3needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_3needle_zh_1000k')
_needlebench_1000k_4needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_4needle_zh_1000k')
_needlebench_1000k_5needle_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_5needle_zh_1000k')
_needlebench_1000k_origin_en.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_en_1000k')
_needlebench_1000k_origin_zh.append(f'Length{original_context_length}Depth{int(depth_percent)}_origin_zh_1000k')
# Concatenate the multi-needle and origin lists
_needlebench_1000k_multi_needle_en = _needlebench_1000k_2needle_en + _needlebench_1000k_3needle_en + _needlebench_1000k_4needle_en + _needlebench_1000k_5needle_en
_needlebench_1000k_multi_needle_zh = _needlebench_1000k_2needle_zh + _needlebench_1000k_3needle_zh + _needlebench_1000k_4needle_zh + _needlebench_1000k_5needle_zh
_needlebench_1000k_origin = _needlebench_1000k_origin_en + _needlebench_1000k_origin_zh
_needlebench_1000k_multi_needle = _needlebench_1000k_multi_needle_en + _needlebench_1000k_multi_needle_zh
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_1000k_parallel_en = []
_needlebench_1000k_parallel_zh = []
for original_context_length in context_lengths_1000k:
_needlebench_1000k_parallel_en.append(f'Length{original_context_length}_parallel_en_1000k')
for original_context_length in context_lengths_1000k:
_needlebench_1000k_parallel_zh.append(f'Length{original_context_length}_parallel_zh_1000k')
_needlebench_1000k_parallel = _needlebench_1000k_parallel_en + _needlebench_1000k_parallel_zh
needlebench_summary_groups = [
{'name': 'original_version', 'subsets': _needlebench_1000k_origin},
{'name': 'original_version_zh', 'subsets': _needlebench_1000k_origin_zh},
{'name': 'original_version_en', 'subsets': _needlebench_1000k_origin_en},
{'name': 'multi_needle_en', 'subsets': _needlebench_1000k_multi_needle_en},
{'name': 'multi_needle2_en', 'subsets': _needlebench_1000k_2needle_en},
{'name': 'multi_needle3_en', 'subsets': _needlebench_1000k_3needle_en},
{'name': 'multi_needle4_en', 'subsets': _needlebench_1000k_4needle_en},
{'name': 'multi_needle5_en', 'subsets': _needlebench_1000k_5needle_en},
{'name': 'multi_needle_zh', 'subsets': _needlebench_1000k_multi_needle_zh},
{'name': 'multi_needle2_zh', 'subsets': _needlebench_1000k_2needle_zh},
{'name': 'multi_needle3_zh', 'subsets': _needlebench_1000k_3needle_zh},
{'name': 'multi_needle4_zh', 'subsets': _needlebench_1000k_4needle_zh},
{'name': 'multi_needle5_zh', 'subsets': _needlebench_1000k_5needle_zh},
{'name': 'multi_needle', 'subsets': _needlebench_1000k_multi_needle},
{'name': 'parallel_version', 'subsets': _needlebench_1000k_parallel},
{'name': 'parallel_version_zh', 'subsets': _needlebench_1000k_parallel_zh},
{'name': 'parallel_version_en', 'subsets': _needlebench_1000k_parallel_en},
{'name': 'overall',
'subsets': [['original_version', 'naive_average'],
['multi_needle', 'naive_average'],
['parallel_version', 'average_score']],
'weights': {'original_version': 0.4,
'multi_needle': 0.3,
'parallel_version': 0.3}},
]
needlebench_1000k_summarizer = dict(
type=NeedleBenchSummarizer,
dataset_abbrs=[
'overall',
'--------- NeedleBench-1000k Single-Needle ---------', # category
'original_version',
'original_version_zh',
'original_version_en',
'--------- NeedleBench-1000k Parallel-Needles ---------', # category
'parallel_version',
'parallel_version_zh',
'parallel_version_en',
'--------- NeedleBench-1000k Multi-Needles ---------', # category
'multi_needle',
'multi_needle_en',
'multi_needle_zh',
'multi_needle2_en',
'multi_needle3_en',
'multi_needle4_en',
'multi_needle5_en',
'multi_needle2_zh',
'multi_needle3_zh',
'multi_needle4_zh',
'multi_needle5_zh',
# *_needlebench_1000k_origin, *_needlebench_1000k_multi_needle, *_needlebench_1000k_parallel,
],
summary_groups=needlebench_summary_groups,
)
context_lengths_8k = list(range(5000, 9000, 1000))
# Repeating the same process for parallel (assuming it's similar to origin_en)
_needlebench_8k_parallel_en_batch1 = []
_needlebench_8k_parallel_en_batch5 = []
_needlebench_8k_parallel_en_batch10 = []
@@ -713,7 +202,6 @@ needlebench_8k_batch_overall_summarizer = dict(
        'parallel_version_en_batch15',
        'parallel_version_zh_batch20',
        'parallel_version_en_batch20',
        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
@@ -754,64 +242,72 @@ needlebench_8k_batch_depth0_summarizer = dict(
        'parallel_version_en_batch15',
        'parallel_version_zh_batch20',
        'parallel_version_en_batch20',
        # *_needlebench_8k_origin, *_needlebench_8k_multi_needle, *_needlebench_8k_parallel,
    ],
    summary_groups=needlebench_summary_groups,
)
def gen_atc_summarizer(needle_num_list):
    categories = [
        'ZH-Direct-CE', 'EN-Direct-CE',
        'ZH-Reasoning-CE', 'EN-Reasoning-CE'
    ]
    needlebench_atc_summary_groups = []

    # Generate summary groups per category
    for category in categories:
        # CircularEval-style categories are scored with perf_4; otherwise acc_1
        metric = 'perf_4' if 'CE' in category else 'acc_1'
        # The dataset abbrs themselves do not carry the CircularEval suffix
        cleaned_category = category.replace('-CE', '').replace('-Direct', '')
        needlebench_atc_summary_groups.append({
            'name': category,
            'subsets': [
                [f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}', metric]
                for num_needles in needle_num_list
            ],
            'weights': {f'NeedleBenchATCDataset-{num_needles}Needle-{cleaned_category}': num_needles for num_needles in needle_num_list},
        })

    needlebench_atc_summary_groups.append({
        'name': 'ATC-CE-Overall',
        'subsets': [
            [f'{category}', 'weighted_average'] for category in categories
        ],
    })

    atc_dataset_abbrs = []
    atc_dataset_abbrs.append(['ATC-CE-Overall', 'naive_average'])

    for category in categories:
        weighted_average_score_entry = [f'{category}', 'weighted_average']
        atc_dataset_abbrs.append(weighted_average_score_entry)

    needlebench_atc_summarizer = dict(
        dataset_abbrs=[
            *atc_dataset_abbrs,
            '######## Needlebench-ATC Accuracy ########',  # category
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'acc_1'] for num_needles in needle_num_list],
            '------------------------------------------',
            '######## Needlebench-ATC CircularEval ########',  # category
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-ZH-Reasoning', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
            *[[f'NeedleBenchATCDataset-{num_needles}Needle-EN-Reasoning', 'perf_4'] for num_needles in needle_num_list],
            '------------------------------------------',
        ],
        summary_groups=needlebench_atc_summary_groups
    )
    return needlebench_atc_summarizer


atc_summarizer_20 = gen_atc_summarizer(list(range(2, 20, 1)))
atc_summarizer_50 = gen_atc_summarizer(list(range(2, 50, 1)))
atc_summarizer_80 = gen_atc_summarizer(list(range(2, 80, 1)))
@@ -5,6 +5,7 @@ import getpass
import math
import os
import os.path as osp
import shutil
from datetime import datetime
from typing import Any, Dict, List, Optional
@@ -26,6 +27,92 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
                               model_abbr_from_cfg)
from opencompass.utils.prompt import get_prompt_hash
model_name_mapping = {
'llama-2-7b-chat-hf': 'LLaMA-2-7B',
'llama-2-13b-chat-hf': 'LLaMA-2-13B',
'llama-2-70b-chat-hf': 'LLaMA-2-70B',
'baichuan2-7b-chat-hf': 'Baichuan2-7B',
'baichuan2-13b-chat-hf': 'Baichuan2-13B',
'yi-6b-chat-hf': 'Yi-6B',
'yi-34b-chat-hf': 'Yi-34B',
'deepseek-67b-chat-hf': 'DeepSeek-67B',
'wizardlm-70b-v1.0-vllm': 'WizardLM-70B',
'qwen-14b-chat-hf': 'Qwen-14B',
'qwen-72b-chat-hf': 'Qwen-72B',
'qwen-72b-chat-vllm': 'Qwen-72B-vLLM',
'internlm2-chat-7b-turbomind': 'InternLM2-7B-200K',
'internlm2-chat-20b-turbomind': 'InternLM2-20B-200K',
'internlm2-chat-7b-hf': 'InternLM2-7B',
'internlm2-chat-20b-hf': 'InternLM2-20B',
'qwen-7b-chat-hf': 'Qwen-7B',
'chatglm3-6b-hf': 'ChatGLM3-6B',
'chatglm3-6b-32k-hf': 'ChatGLM3-6B-32K',
'zephyr-7b-beta-vllm': 'Zephyr-7B Beta',
'mistral-7b-instruct-v0.2-vllm': 'Mistral-7B Inst. v0.2',
'mistral-7b-instruct-v0.1-vllm': 'Mistral-7B Inst. v0.1',
'mixtral-8x7b-instruct-v0.1-vllm': 'Mixtral-8x7B Inst. v0.1',
'orionstar-yi-34b-chat-hf': 'OrionStar-Yi-34B',
'orionstar-14b-long-chat-vllm': 'Orion-14B-LongChat',
'internlm-chat-7b-hf': 'InternLM-7B',
'gemma-2b-it-hf': 'Gemma-2B',
'gemma-7b-it-hf': 'Gemma-7B',
'qwen1.5-0.5b-chat-hf': 'Qwen-1.5-0.5B',
'qwen1.5-1.8b-chat-hf': 'Qwen-1.5-1.8B',
'qwen1.5-4b-chat-hf': 'Qwen-1.5-4B',
'qwen1.5-14b-chat-hf': 'Qwen-1.5-14B',
'qwen1.5-72b-chat-hf': 'Qwen-1.5-72B',
'qwen1.5-14b-chat-vllm': 'Qwen-1.5-14B-vLLM',
'qwen1.5-72b-chat-vllm': 'Qwen-1.5-72B-vLLM',
'glm4_notools': 'GLM-4',
'claude-3-opus': 'Claude-3-Opus',
# Add more mappings as necessary
}
dataset_mapping_dict = {}
needle_counts = ['2', '3', '4', '5']
languages = ['en', 'zh']
sizes = ['4k', '8k', '32k', '200k', '1000k']
types = ['origin', 'parallel']
for needle_count in needle_counts:
    for language in languages:
        for size in sizes:
            key = f'{needle_count}needle_{language}_{size}'
            value = f'{needle_count}-Needle-Reasoning-{language.upper()}-{size.upper()}'
            dataset_mapping_dict[key] = value

for t in types:
    for language in languages:
        for size in sizes:
            if t == 'origin':
                key = f'{t}_{language}_{size}'
                value = f'Single-Needle-Retrieval-{language.upper()}-{size.upper()}'
            elif t == 'parallel':
                key = f'{t}_{language}_{size}'
                value = f'Multi-Needle-Retrieval-{language.upper()}-{size.upper()}'
            dataset_mapping_dict[key] = value
def calculate_elementwise_average(model_name, merged_df):
    score_columns = [col for col in merged_df.columns if col != 'dataset']

    origin_columns = [col for col in score_columns if 'origin' in col]
    parallel_columns = [col for col in score_columns if 'parallel' in col]
    multi_columns = [col for col in score_columns if 'needle' in col]

    if origin_columns and parallel_columns and multi_columns:
        origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4
        parallel_avg = merged_df[parallel_columns].mean(axis=1) * 0.3
        multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3
        merged_df[model_name] = origin_avg + parallel_avg + multi_avg
    else:
        relevant_columns = origin_columns or parallel_columns or multi_columns
        if relevant_columns:
            merged_df[model_name] = merged_df[relevant_columns].mean(axis=1)
        else:
            merged_df[model_name] = pd.Series([0] * len(merged_df))

    return merged_df.iloc[:, [0, -1]]
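To make the 0.4/0.3/0.3 weighting above concrete, here is a small self-contained check (the column names are toy values chosen only to hit the 'origin'/'parallel'/'needle' substring matching):

# Sketch: toy input for calculate_elementwise_average (column names are assumed).
import pandas as pd

toy = pd.DataFrame({
    'dataset': ['Length1000Depth0', 'Length1000Depth10'],
    'origin_en_4k': [80.0, 60.0],
    'parallel_en_4k': [50.0, 70.0],
    '2needle_en_4k': [40.0, 20.0],
})
out = calculate_elementwise_average('demo-model', toy)
# Row 0: 0.4 * 80 + 0.3 * 50 + 0.3 * 40 = 59.0
# Row 1: 0.4 * 60 + 0.3 * 70 + 0.3 * 20 = 51.0
print(out)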
def read_after_specific_line_except_last(file_name, keyword, offset):
    with open(file_name, 'r', encoding='utf-8') as file:
@@ -65,6 +152,12 @@ def create_model_dataframe(nested_dict, model_name, dataset_abbr, parallel=False
    df = pd.DataFrame(data, columns=['dataset', model_name])
    return df
def convert_to_k(value):
    try:
        return f'{int(value) // 1000}k'
    except ValueError:
        return value
def parse_model_scores(text):
    lines = text.split('\n')
@@ -82,8 +175,86 @@ def parse_model_scores(text):
    return result_dict
def remove_empty_subfolders(plot_path):
    for folder_name in tqdm(os.listdir(plot_path),
                            desc='Deleting Empty folders'):
        folder_path = os.path.join(plot_path, folder_name)
        if os.path.isdir(folder_path):
            if not os.listdir(folder_path):
                shutil.rmtree(folder_path)


def save_results_to_plots(txt_results_save_path):

    content = read_after_specific_line_except_last(txt_results_save_path, 'raw format', 2)
    parsed_data = parse_model_scores(content)
    model_names = get_dict_model_names(parsed_data)

    numbers = [2, 3, 4, 5]
    languages = ['en', 'zh']
    size_exists = []
    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k']

    for size in sizes_origin:
        if size in content:
            size_exists.append(size)

    multi_dataset_abbrs = [f'{num}needle_{lang}{size}' for num in numbers for lang in languages for size in size_exists]
    origin_dataset_abbrs = [f'origin_{lang}{size}' for lang in languages for size in size_exists]
    parallel_dataset_abbrs = [f'parallel_{lang}{size}' for lang in languages for size in size_exists]
    dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + \
        parallel_dataset_abbrs

    base_path = os.path.dirname(txt_results_save_path)
    plot_path = os.path.join(base_path, 'plots')
    model_scores = {}
    for model_name in tqdm(model_names):
        model_datasets_scores = {}  # Dictionary to store scores for each dataset for the current model
        for dataset_abbr in dataset_abbrs:
            parallel_flag = 'parallel' in dataset_abbr
            folder_path = os.path.join(plot_path, dataset_mapping_dict[dataset_abbr])
            ensure_directory(folder_path)

            save_path = os.path.join(folder_path, f'{model_name}.png')

            df = create_model_dataframe(parsed_data, model_name, dataset_abbr, parallel=parallel_flag)

            score = visualize(df, save_path, model_name, dataset_abbr)

            model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score)

        overall_dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + parallel_dataset_abbrs
        overall_score_pic_path = os.path.join(plot_path, f'{model_name}_overall.png')
        merged_df = merge_dataframes(model_name, overall_dataset_abbrs, parsed_data)
        averaged_df = calculate_elementwise_average(model_name, merged_df)
        overall_score = visualize(averaged_df, overall_score_pic_path, model_name, 'Overall Score')

        # Single-Retrieval
        single_retrieval_score_pic_path = os.path.join(plot_path, f'{model_name}_single_retrieval_overall.png')
        single_retrieval_merged_df = merge_dataframes(model_name, origin_dataset_abbrs, parsed_data)
        single_retrieval_averaged_df = calculate_elementwise_average(model_name, single_retrieval_merged_df)
        single_retrieval_overall_score = visualize(single_retrieval_averaged_df, single_retrieval_score_pic_path, model_name, 'Single-Retrieval Overall Score')

        # Multi-Retrieval
        multi_retrieval_score_pic_path = os.path.join(plot_path, f'{model_name}_multi_retrieval_overall.png')
        multi_retrieval_merged_df = merge_dataframes(model_name, parallel_dataset_abbrs, parsed_data)
        multi_retrieval_averaged_df = calculate_elementwise_average(model_name, multi_retrieval_merged_df)
        multi_retrieval_overall_score = visualize(multi_retrieval_averaged_df, multi_retrieval_score_pic_path, model_name, 'Multi-Retrieval Overall Score')

        # Multi-Reasoning
        multi_reasoning_score_pic_path = os.path.join(plot_path, f'{model_name}_multi_reasoning_overall.png')
        multi_reasoning_merged_df = merge_dataframes(model_name, multi_dataset_abbrs, parsed_data)
        multi_reasoning_averaged_df = calculate_elementwise_average(model_name, multi_reasoning_merged_df)
        multi_reasoning_overall_score = visualize(multi_reasoning_averaged_df, multi_reasoning_score_pic_path, model_name, 'Multi-Reasoning Overall Score')

        model_scores[model_name] = averaged_df
    remove_empty_subfolders(plot_path)
    return model_scores
def visualize(df_raw, save_path: str, model_name: str, dataset_type: str):
    df = df_raw.copy()
    if df.empty:
        return -1
    df['Context Length'] = df['dataset'].apply(
        lambda x: int(x.split('Length')[1].split('Depth')[0]))
    df['Document Depth'] = df['dataset'].apply(
@@ -98,144 +269,96 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
    model_df = df[['Document Depth', 'Context Length',
                   model_name]].copy()
    model_df.rename(columns={model_name: 'Score'}, inplace=True)

    pivot_table = pd.pivot_table(model_df,
                                 values='Score',
                                 index=['Document Depth'],
                                 columns=['Context Length'],
                                 aggfunc='mean')

    mean_scores = pivot_table.mean().values
    overall_score = mean_scores.mean()

    plt.figure(figsize=(10, 6))
    ax = plt.gca()
    cmap = LinearSegmentedColormap.from_list(
        'custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])

    sns.heatmap(pivot_table,
                cmap=cmap,
                ax=ax,
                vmin=0,
                vmax=100)
    cbar = ax.collections[0].colorbar

    x_data = [i + 0.5 for i in range(len(mean_scores))]
    y_data = mean_scores

    ax2 = ax.twinx()
    ax2.plot(x_data,
             y_data,
             color='white',
             marker='o',
             linestyle='-',
             linewidth=2,
             markersize=8,
             label='Average Depth Score'
             )
    ax2.set_ylim(0, 100)
    ax2.set_yticklabels([])
    ax2.set_yticks([])
    ax2.legend(loc='lower left')

    if model_name in model_name_mapping:
        title_name = model_name_mapping[model_name]
    else:
        title_name = model_name
    ax.set_title(title_name, fontsize=12, fontweight='bold', pad=15)

    if dataset_type in dataset_mapping_dict:
        dataset_name = dataset_mapping_dict[dataset_type]
    else:
        dataset_name = dataset_type
    ax.text(0.5, 1.005, f'{dataset_name}:{overall_score:.2f}',
            transform=ax.transAxes,
            ha='center',
            fontsize=12,
            fontweight='normal')

    ax.set_xlabel('Token Length', fontsize=13, fontweight='normal', labelpad=1)
    ax.set_ylabel('Depth Percent(%)', fontsize=13, fontweight='normal', labelpad=1)
    converted_labels = [convert_to_k(value) for value in pivot_table.columns.values]

    ax.tick_params(axis='both', which='major', length=1, pad=1)
    ax.tick_params(axis='both', which='minor', length=1, pad=1)
    ax.set_xticklabels(converted_labels, rotation=45)

    index_length = len(pivot_table.index)
    selected_indices = pivot_table.index.values[::2]
    labels = [str(int(index)) for index in selected_indices]
    ax.set_yticks(np.arange(0, len(pivot_table.index), 2))
    ax.set_yticklabels(labels, rotation=0)

    for spine in ax.spines.values():
        spine.set_visible(False)
    for spine in ax2.spines.values():
        spine.set_visible(False)

    plt.tight_layout()
    plt.draw()

    directory_path, original_filename = os.path.split(save_path)
    filename_suffix = (title_name + '_' + dataset_name).replace(' ', '_')
    new_filename = f'{filename_suffix}.png'
    new_save_path = os.path.join(directory_path, new_filename)

    plt.savefig(new_save_path, format='png', bbox_inches='tight', pad_inches=0)
    print(f'Saved :{new_save_path}')

    plt.close()

    return overall_score
# Set chart title and labels
ax.set_title(f'{model_name} {dataset_type} Context '
'Performance\nFact Retrieval Across '
'Context Lengths ("Needle In A Haystack")')
ax.set_xlabel('Token Limit')
ax.set_ylabel('Depth Percent')
ax.set_xticklabels(pivot_table.columns.values, rotation=45)
ax.set_yticklabels(pivot_table.index.values, rotation=0)
# Add overall score as a subtitle
plt.text(0.5,
-0.13, f'Overall Score for {model_name}: '
f'{overall_score:.2f}',
ha='center',
va='center',
transform=ax.transAxes,
fontsize=13)
plt.tight_layout()
plt.subplots_adjust(right=1)
plt.draw()
plt.savefig(save_path)
print(f'Saved :{save_path}')
plt.close() # Close figure to prevent memory leaks
return overall_score
def save_results_to_plots(txt_results_save_path):

    content = read_after_specific_line_except_last(txt_results_save_path, 'raw format', 2)
    parsed_data = parse_model_scores(content)
    model_names = get_dict_model_names(parsed_data)

    numbers = [2, 3, 4, 5]
    languages = ['en', 'zh']
    size_exists = []
    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k']

    for size in sizes_origin:
        if size in content:
            size_exists.append(size)

    multi_dataset_abbrs = [f'{num}needle_{lang}{size}' for num in numbers for lang in languages for size in size_exists]
    origin_dataset_abbrs = [f'origin_{lang}{size}' for lang in languages for size in size_exists]
    parallel_dataset_abbrs = [f'parallel_{lang}{size}' for lang in languages for size in size_exists]
    dataset_abbrs = multi_dataset_abbrs + origin_dataset_abbrs + \
        parallel_dataset_abbrs

    base_path = os.path.dirname(txt_results_save_path)
    plot_path = os.path.join(base_path, 'plots')
    model_scores = {}
    for model_name in tqdm(model_names):
        model_datasets_scores = {}  # Dictionary to store scores for each dataset for the current model
        for dataset_abbr in dataset_abbrs:
            parallel_flag = 'parallel' in dataset_abbr

            # Create a directory for each dataset_abbr
            folder_path = os.path.join(plot_path, dataset_abbr)
            ensure_directory(folder_path)

            # Construct the full path to save the image
            save_path = os.path.join(folder_path, f'{model_name}.png')

            # Create DataFrame for the model and dataset
            df = create_model_dataframe(parsed_data, model_name, dataset_abbr, parallel=parallel_flag)

            # Generate visualization and get the score
            score = visualize(df, save_path, model_name, dataset_abbr)

            # Store the score in the dictionary
            model_datasets_scores[dataset_abbr] = '{:.02f}'.format(score)

        # Process and visualize the overall score
        overall_score_pic_path = os.path.join(plot_path, f'{model_name}_overall.png')
        merged_df = merge_dataframes(model_name, dataset_abbrs, parsed_data)
        print(merge_dataframes)
        averaged_df = calculate_elementwise_average(merged_df)

        # Assume visualize returns the average score for the overall visualization
        overall_score = visualize(averaged_df, overall_score_pic_path, 'weighted_average_score', 'Overall Score')

        # Add the overall score to the dictionary
        model_datasets_scores['Overall'] = '{:.02f}'.format(overall_score)

        # Add the model's scores to the main dictionary
        model_scores[model_name] = model_datasets_scores
def ensure_directory(path): def ensure_directory(path):
if not os.path.exists(path): if not os.path.exists(path):
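
For reference, the scoring step that the reworked visualize() performs before plotting can be reproduced in isolation: pivot the per-depth/per-length scores, average each context-length column, then average those column means into the overall score. The toy DataFrame below is invented purely for illustration and is not part of the patch.

import pandas as pd

model_df = pd.DataFrame({
    'Document Depth': [0, 50, 100, 0, 50, 100],
    'Context Length': [1000, 1000, 1000, 4000, 4000, 4000],
    'Score': [100, 90, 80, 70, 60, 50],
})
pivot_table = pd.pivot_table(model_df,
                             values='Score',
                             index=['Document Depth'],
                             columns=['Context Length'],
                             aggfunc='mean')
mean_scores = pivot_table.mean().values  # one mean per context length: [90., 60.]
overall_score = mean_scores.mean()       # 75.0
print(f'Overall score: {overall_score:.2f}')
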
@@ -263,29 +386,11 @@ def merge_dataframes(model_name, dataset_abbrs, parsed_data):
    merged_df = reduce(lambda left, right: pd.merge(left, right, on='dataset', how='outer'), dfs)
    if merged_df.isnull().any().any():
-        print('Warning: Some rows were filtered out due to NaN values. This is often due to mismatched row counts among DataFrames.')
+        print('Warning: Some rows were filtered out due to NaN values. '
+              'This is often due to mismatched row counts among DataFrames.')
        merged_df = merged_df.dropna()
    return merged_df
-def calculate_elementwise_average(merged_df):
-    score_columns = [col for col in merged_df.columns if col != 'dataset']
-    origin_columns = [col for col in score_columns if 'origin' in col]
-    parallel_columns = [col for col in score_columns if 'parallel' in col]
-    multi_columns = [col for col in score_columns if 'needle' in col]
-    if origin_columns and parallel_columns and multi_columns:
-        origin_avg = merged_df[origin_columns].mean(axis=1) * 0.4
-        parallel_avg = merged_df[parallel_columns].mean(axis=1) * 0.3
-        multi_avg = merged_df[multi_columns].mean(axis=1) * 0.3
-        merged_df['weighted_average_score'] = origin_avg + parallel_avg + multi_avg
-    else:
-        merged_df['weighted_average_score'] = pd.Series([0] * len(merged_df))
-    return merged_df.iloc[:, [0, -1]]
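
The removed calculate_elementwise_average() weighted the three task families as 0.4 (original) / 0.3 (parallel) / 0.3 (multi-needle). A standalone sketch of that weighting, with invented column names and scores, for anyone who still needs the old behaviour:

import pandas as pd

merged_df = pd.DataFrame({
    'dataset': ['Length1000Depth0', 'Length4000Depth50'],
    'origin_en_4k': [90.0, 80.0],
    'parallel_en_4k': [70.0, 60.0],
    '2needle_en_4k': [50.0, 40.0],
})
origin_avg = merged_df[['origin_en_4k']].mean(axis=1) * 0.4
parallel_avg = merged_df[['parallel_en_4k']].mean(axis=1) * 0.3
multi_avg = merged_df[['2needle_en_4k']].mean(axis=1) * 0.3
merged_df['weighted_average_score'] = origin_avg + parallel_avg + multi_avg
print(merged_df[['dataset', 'weighted_average_score']])  # 72.0 and 62.0
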
class NeedleBenchSummarizer(DefaultSummarizer):
    """NeedleBench summarizer in OpenCompass.
@@ -303,20 +408,17 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        summarizer_dataset_abbrs = []
        if self.dataset_abbrs is None:
-            # display all dataset metrics included in the config
            for dataset_abbr in dataset_abbrs:
                if dataset_abbr in dataset_metrics:
                    for metric in dataset_metrics[dataset_abbr]:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
                else:
                    summarizer_dataset_abbrs.append((dataset_abbr, None))
-            # along with all possible group metrics
            for dataset_abbr in dataset_metrics:
                for metric in dataset_metrics[dataset_abbr]:
                    if (dataset_abbr, metric) not in summarizer_dataset_abbrs:
                        summarizer_dataset_abbrs.append((dataset_abbr, metric))
        else:
-            # follow the required order
            for item in self.dataset_abbrs:
                if isinstance(item, str):
                    summarizer_dataset_abbrs.append((item, None))
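
The required-order branch above normalises configured entries into (dataset_abbr, metric) pairs. A toy sketch of the effect; the tuple branch follows DefaultSummarizer's convention and is assumed here, and the abbreviations are invented:

dataset_abbrs = ['origin_en_4k', ('parallel_en_4k', 'average_score')]
summarizer_dataset_abbrs = []
for item in dataset_abbrs:
    if isinstance(item, str):
        summarizer_dataset_abbrs.append((item, None))
    elif isinstance(item, (list, tuple)):
        summarizer_dataset_abbrs.append((item[0], item[1]))
print(summarizer_dataset_abbrs)
# [('origin_en_4k', None), ('parallel_en_4k', 'average_score')]
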
@@ -332,6 +434,7 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        for dataset_abbr, metric in summarizer_dataset_abbrs:
            if dataset_abbr not in dataset_metrics:
                table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
+                table.append(header)
                continue
@@ -378,33 +481,7 @@ class NeedleBenchSummarizer(DefaultSummarizer):
        raw_txts = '\n'.join(raw_txts)
        return raw_txts
-    def _read_and_sort_dataframe(self, file_path):
-        # Read the file without treating the first row as a header
-        df = pd.read_csv(file_path, header=None)
-        # Function to sort columns based on the value of a specific row, excluding the first column
-        def sort_columns_based_on_row_corrected(df, base_row_idx, start_row_idx, end_row_idx):
-            # Extract the rows for sorting
-            sort_values_row = df.iloc[base_row_idx, 1:].replace('-', np.nan).apply(pd.to_numeric, errors='coerce')
-            # Handle NaNs by setting them to a value less than the minimum or using a method to keep them at the end
-            min_possible_value = sort_values_row.min(skipna=True) - 1  # Use min value in the row minus 1 or another method
-            sort_values_row_filled = sort_values_row.fillna(min_possible_value)
-            # Get the sorted order of indices, excluding the first column
-            sorted_col_indices = sort_values_row_filled.sort_values(ascending=False).index
-            # Apply the sorted column indices to the whole DataFrame, adjusting for Python's 0-based index
-            df.iloc[start_row_idx:end_row_idx+1] = df.iloc[start_row_idx:end_row_idx+1, [0] + sorted_col_indices.tolist()]
-        # Apply the corrected sorting function based on the description
-        sort_columns_based_on_row_corrected(df, 1, 0, 2)  # For rows 1-2 based on row 2's values
-        sort_columns_based_on_row_corrected(df, 4, 3, 7)  # For rows 4-7 based on row 5's values
-        sort_columns_based_on_row_corrected(df, 9, 8, 12)  # For rows 9-12 based on row 10's values
-        sort_columns_based_on_row_corrected(df, 14, 13, 25)  # For rows 14-25 based on row 15's values
-        # Return the sorted DataFrame
-        return df
    def _output_to_file(self, output_path, time_str, table, raw_txts):
-        # output to file
        if output_path is None:
            output_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.txt')
            output_csv_path = osp.join(self.work_dir, 'summary', f'summary_{time_str}.csv')
@@ -436,38 +513,19 @@ class NeedleBenchSummarizer(DefaultSummarizer):
            f.write('\n'.join([','.join(row) for row in table]) + '\n')
        self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')
-        df_sorted = self._read_and_sort_dataframe(output_csv_path)
-        sorted_file_path = osp.abspath(output_csv_path).split('.')[0] + '_sorted.csv'
-        df_sorted.to_csv(sorted_file_path, index=False, header=False)
-        self.logger.info(f'write sorted csv to {sorted_file_path}')
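
The removed _read_and_sort_dataframe() reordered the score columns of the summary CSV block by block, using one reference row per block. A minimal sketch of the same idea (it writes the block back positionally via .values rather than the original iloc assignment; the DataFrame contents are invented):

import numpy as np
import pandas as pd

df = pd.DataFrame([['dataset', 'model_a', 'model_b', 'model_c'],
                   ['origin_en_4k', '55.00', '75.00', '-']])

def sort_columns_by_row(df, base_row_idx, start_row_idx, end_row_idx):
    # Numeric view of the reference row, with '-' pushed below the smallest score
    row = (df.iloc[base_row_idx, 1:]
             .replace('-', np.nan)
             .apply(pd.to_numeric, errors='coerce'))
    row = row.fillna(row.min(skipna=True) - 1)
    order = [0] + row.sort_values(ascending=False).index.tolist()
    block = df.iloc[start_row_idx:end_row_idx + 1, order].values
    df.iloc[start_row_idx:end_row_idx + 1, :] = block

sort_columns_by_row(df, base_row_idx=1, start_row_idx=0, end_row_idx=1)
print(df)  # model_b (75.00) now precedes model_a (55.00); '-' stays last
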
    def summarize(
            self,
            output_path: str = None,
            time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):  # noqa
-        # pick up results
        raw_results, parsed_results, dataset_metrics, dataset_eval_mode = self._pick_up_results()
-        # calculate group metrics
        raw_results, parsed_results, dataset_metrics, dataset_eval_mode = \
            self._calculate_group_metrics(raw_results, parsed_results, dataset_metrics, dataset_eval_mode)
-        # format table
        table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode)
-        # format raw txt
        raw_txts = self._format_raw_txt(raw_results)
-        # output to screen
        print(tabulate.tabulate(table, headers='firstrow'))
-        # output to .text / .csv files
        self._output_to_file(output_path, time_str, table, raw_txts)
        if self.lark_reporter:
            content = f'{getpass.getuser()} 的'
            content += f'详细评测汇总已输出至 {osp.abspath(output_path)}'
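
As a usage note, a run config only needs to point its summarizer entry at this class; leaving dataset_abbrs unset exercises the self.dataset_abbrs is None branch shown above. A minimal, hypothetical config fragment (only the import shown in this commit is taken from the source; everything else is a sketch):

from opencompass.summarizers.needlebench import NeedleBenchSummarizer

# Minimal summarizer entry for an OpenCompass run config; add dataset_abbrs
# and summary_groups here if a fixed display order is wanted.
summarizer = dict(type=NeedleBenchSummarizer)
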