Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
DataAnalysis
Commits
9afddf86
Commit
9afddf86
authored
Mar 16, 2026
by
sharkgene@qq.com
Browse files
init version
parent
089c9e5a
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
376 additions
and
0 deletions
+376
-0
README.md
README.md
+40
-0
data_config.json
data_config.json
+19
-0
plot_comparison.py
plot_comparison.py
+317
-0
No files found.
README.md
View file @
9afddf86
# DataAnalysis
将大模型推理测试生成的excel数据文件,按照特定方式生成柱状图进行比较
配置文件data_config.json示例说明:
{
"filter": {
"并发数": [],
"模型": []
},
"distinguish": ["模型", "卡类型", "卡数"],
"group_by": [["并发数"], ["输入长度(tokens)", "输出长度(tokens)"]],
"files": [
{
"file": "test100.xlsx",
"sheets": [],
"column_mapping": {
},
"column_replace": {
}
}
]
}
filter: 过滤条件,不设置或者为空则不过滤
distinguish: 选择输出文件的条件,例如["模型", "卡类型", "卡数"]则将相同的模型、卡类型和卡数的所有数据生成一个文件
group_by: 可以进行最多2重分组,每层一个图,第2层以不同柱状显示数据。如果命令行参数指定合并,则所有合成一个图,分组之间间隔开
files: 指定文件信息
## 使用方法
python3 plot_comparison.py --help
usage: plot_comparison.py [-h] [--配置 配置] [--输出目录 输出目录] [--合并分组]
绘制模型性能对比图表
options:
-h, --help show this help message and exit
--配置 配置, -f 配置 数据配置文件路径
--输出目录 输出目录, -d 输出目录 输出图表目录
--合并分组, -m 将第一层分组合并到一张图中
##
data_config.json
0 → 100644
View file @
9afddf86
{
"filter"
:
{
"并发数"
:
[],
"模型"
:
[]
},
"distinguish"
:
[
"模型"
,
"卡类型"
,
"卡数"
],
"group_by"
:
[[
"并发数"
],
[
"输入长度(tokens)"
,
"输出长度(tokens)"
]],
"files"
:
[
{
"file"
:
"test100.xlsx"
,
"sheets"
:
[],
"column_mapping"
:
{
},
"column_replace"
:
{
}
}
]
}
plot_comparison.py
0 → 100644
View file @
9afddf86
import
pandas
as
pd
import
matplotlib.pyplot
as
plt
import
matplotlib.cm
as
cm
import
numpy
as
np
import
argparse
import
json
import
os
# ---------------------------------------------------------------------------
# Global matplotlib configuration: font fallbacks that can render CJK glyphs,
# and a plain ASCII minus so negative ticks display with those fonts.
# ---------------------------------------------------------------------------
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# ---------------------------------------------------------------------------
# Command-line interface (flag names are intentionally in Chinese).
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='绘制模型性能对比图表')
parser.add_argument('--配置', '-f', type=str, default='data_config.json',
                    help='数据配置文件路径')
parser.add_argument('--输出目录', '-d', type=str, default='charts',
                    help='输出图表目录')
parser.add_argument('--合并分组', '-m', action='store_true',
                    help='将第一层分组合并到一张图中')
args = parser.parse_args()
def load_data_from_files(config):
    """Load every configured Excel sheet into one concatenated DataFrame.

    Parameters
    ----------
    config : dict
        Parsed data_config.json.  Uses the ``files`` key: a list of dicts
        with ``file`` (path), ``sheets`` (list; empty/None means all sheets),
        ``column_mapping`` (header rename map) and ``column_replace``
        (per-column value replacement map).

    Returns
    -------
    pandas.DataFrame
        All rows from every readable sheet, with ``source_file`` and
        ``source_sheet`` provenance columns appended; an empty DataFrame
        when nothing could be loaded.
    """
    all_data = []
    files_config = config.get('files', [])
    for file_config in files_config:
        file_path = file_config.get('file')
        sheets = file_config.get('sheets', [])
        column_mapping = file_config.get('column_mapping', {})
        # Robustness fix: a missing/empty 'file' entry previously crashed
        # os.path.exists(None) with TypeError; now it is skipped like a
        # nonexistent path.
        if not file_path or not os.path.exists(file_path):
            print(f"文件不存在: {file_path}, 跳过")
            continue
        xl = pd.ExcelFile(file_path)
        # An empty or None sheet list means "every sheet in the workbook".
        if sheets is None or (isinstance(sheets, list) and len(sheets) == 0):
            sheets = xl.sheet_names
        else:
            sheets = [s for s in sheets if s]
        for sheet in sheets:
            try:
                df = pd.read_excel(file_path, sheet_name=sheet)
                # Normalise headers: drop embedded newlines and edge spaces.
                df.columns = df.columns.str.replace('\n', '').str.strip()
                if column_mapping:
                    df = df.rename(columns=column_mapping)
                column_replace = file_config.get('column_replace', {})
                for col, replace_dict in column_replace.items():
                    if col in df.columns:
                        df[col] = df[col].replace(replace_dict)
                # Keep provenance so rows can be traced back to their source.
                df['source_file'] = file_path
                df['source_sheet'] = sheet
                all_data.append(df)
                print(f"读取: {file_path} - {sheet}, {len(df)}行")
            except Exception as e:
                # Best-effort loading: report the failing sheet and continue.
                print(f"读取失败: {file_path} - {sheet}: {e}")
    if not all_data:
        return pd.DataFrame()
    return pd.concat(all_data, ignore_index=True)
def apply_filter(df, filter_dict):
    """Return *df* restricted to the rows allowed by *filter_dict*.

    Each key of *filter_dict* names a column; a list value keeps rows whose
    column value is in the list, a scalar keeps rows equal to it.  Keys that
    are absent from *df* or map to an empty/falsy value are ignored.
    """
    for column, wanted in filter_dict.items():
        # Skip unknown columns and empty filter specs (empty == "no filter").
        if column not in df.columns or not wanted:
            continue
        if isinstance(wanted, list):
            df = df[df[column].isin(wanted)]
        else:
            df = df[df[column] == wanted]
    return df
def generate_chart(df_subset, output_path, compare_col, outer_group_cols,
                   inner_group_cols, metric_cols, merge_groups=False):
    """Render grouped bar charts comparing engine variants and save a PNG.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Rows for one distinguishing combination; must contain the columns
        'vLLM版本' and 'V0/V1 Engine' plus all grouping and metric columns.
    output_path : str
        Destination PNG path.
    compare_col : str
        Name of the synthetic column used to tell compared series apart.
    outer_group_cols, inner_group_cols : list[str]
        First- and second-level grouping columns.
    metric_cols : list[str]
        Metric columns; one subplot column is drawn per metric.
        (Generalised: the subplot count was previously hard-coded to 4.)
    merge_groups : bool
        When True (and more than one outer group exists) all outer groups
        share one row of axes, separated by gaps and dashed dividers.

    Returns
    -------
    bool
        True when a chart was written, False when there was no data.
    """
    df_subset = df_subset.copy()
    # Series identity = vLLM version + engine generation, e.g. "0.8.4_V1".
    df_subset[compare_col] = (df_subset['vLLM版本'].astype(str) + '_'
                              + df_subset['V0/V1 Engine'].astype(str))

    # Average the metrics over every (group..., series) combination.
    all_group_cols = outer_group_cols + inner_group_cols
    if all_group_cols:
        df_grouped = (df_subset[all_group_cols + [compare_col] + metric_cols]
                      .groupby(all_group_cols + [compare_col])
                      .mean().reset_index())
    else:
        df_grouped = (df_subset[[compare_col] + metric_cols]
                      .groupby([compare_col]).mean().reset_index())
        # NOTE(review): this replaces the series labels with the positional
        # index after reset_index — looks suspicious; confirm it is intended.
        df_grouped[compare_col] = df_grouped.index

    if len(df_grouped) == 0:
        print("  无数据,跳过")
        return False

    if outer_group_cols:
        outer_values = df_grouped.groupby(outer_group_cols).size().reset_index()
    else:
        # A single synthetic "all" group keeps the loops below uniform.
        outer_values = pd.DataFrame({'': ['all']})
    n_outer = len(outer_values)

    engine_values = df_grouped[compare_col].unique()
    n_engines = len(engine_values)
    n_metrics = len(metric_cols)

    color_palette = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D',
                     '#3B1F2B', '#95C623', '#7B2D26']
    colors = [color_palette[i % len(color_palette)] for i in range(n_engines)]

    if merge_groups and n_outer > 1:
        # One row of axes; all outer groups laid side by side in each subplot.
        # squeeze=False guarantees a 2-D axes array even for one metric.
        fig, axes = plt.subplots(1, n_metrics, figsize=(8 * n_outer + 20, 10),
                                 squeeze=False)
        bar_width = 0.12
        bar_spacing = 0.05
        group_gap = 3
        x_labels_all = None
        for col, metric in enumerate(metric_cols):
            ax = axes[0, col]
            current_x = 0
            for row_idx, (_, outer_row) in enumerate(outer_values.iterrows()):
                df_outer = df_grouped.copy()
                for gcol in outer_group_cols:
                    df_outer = df_outer[df_outer[gcol] == outer_row[gcol]]
                outer_label_value = '-'.join([str(outer_row[gcol])
                                              for gcol in outer_group_cols])
                pt = df_outer.pivot_table(index=inner_group_cols,
                                          columns=compare_col,
                                          values=metric).fillna(0)
                n_bars_per_group = len(pt)
                group_width = (n_bars_per_group * n_engines
                               * (bar_width + bar_spacing) + group_gap)
                group_center = current_x + group_width / 2
                x_labels = ['/'.join([str(v) for v in idx]) for idx in pt.index]
                # NOTE(review): tick labels come from the FIRST outer group
                # only — assumes every group shares the same inner labels.
                if x_labels_all is None:
                    x_labels_all = x_labels
                x = (np.arange(len(x_labels))
                     * (n_engines * (bar_width + bar_spacing)) + current_x)
                for i, engine in enumerate(engine_values):
                    if engine in pt.columns:
                        values = pt[engine].values
                        offset = i * bar_width
                        label = f"{engine}({outer_label_value})"
                        bars = ax.bar(x + offset, values, bar_width,
                                      label=label, color=colors[i],
                                      edgecolor='white', linewidth=0.5)
                        # Annotate each positive bar with its value.
                        for bar, val in zip(bars, values):
                            if val > 0:
                                y_pos = (bar.get_height()
                                         + bar.get_height() * 0.02
                                         if bar.get_height() > 0 else 1)
                                ax.text(bar.get_x() + bar.get_width() / 2,
                                        y_pos, f'{val:.1f}', ha='center',
                                        va='bottom', fontsize=5,
                                        fontweight='bold')
                # Dashed separator and boxed caption for this outer group.
                ax.axvline(x=current_x + n_bars_per_group * n_engines
                           * (bar_width + bar_spacing) + group_gap / 2,
                           color='gray', linestyle='--', linewidth=1)
                ax.text(group_center, ax.get_ylim()[1] * 0.95,
                        outer_label_value, ha='center', va='top', fontsize=9,
                        fontweight='bold',
                        bbox=dict(boxstyle='round', facecolor='wheat',
                                  alpha=0.5))
                current_x = (current_x + n_bars_per_group * n_engines
                             * (bar_width + bar_spacing) + group_gap)
            # Compute one x-tick centre per inner label in every group.
            total_inner_labels = len(x_labels_all)
            inner_positions = []
            inner_labels = []
            for gi in range(len(outer_values)):
                base_x = gi * (total_inner_labels * n_engines
                               * (bar_width + bar_spacing) + group_gap)
                for xi in range(total_inner_labels):
                    center_pos = (base_x
                                  + xi * n_engines * (bar_width + bar_spacing)
                                  + (n_engines * bar_width
                                     + (n_engines - 1) * bar_spacing) / 2)
                    inner_positions.append(center_pos)
                    inner_labels.append(x_labels_all[xi])
            ax.set_xticks(inner_positions)
            ax.set_xticklabels(inner_labels, rotation=45, ha='right',
                               fontsize=6)
            ax.set_xlabel('/'.join(inner_group_cols), fontsize=9)
            ax.set_ylabel(metric, fontsize=10)
            ax.set_title(f'{metric}', fontsize=12, fontweight='bold')
            ax.grid(axis='y', alpha=0.3, linestyle='--')
            ax.legend(fontsize=5, loc='upper right', framealpha=0.9, ncol=1)
    else:
        # One row of axes per outer group, one column per metric.
        # squeeze=False replaces the old manual reshape(1, -1) and also
        # covers the 1x1 case, where subplots would return a bare Axes.
        fig, axes = plt.subplots(n_outer, n_metrics,
                                 figsize=(6 * n_metrics, 5 * n_outer),
                                 squeeze=False)
        bar_width = 0.2
        outer_label = '/'.join(outer_group_cols) if outer_group_cols else '全部'
        for row_idx, (_, outer_row) in enumerate(outer_values.iterrows()):
            df_outer = df_grouped.copy()
            for col in outer_group_cols:
                df_outer = df_outer[df_outer[col] == outer_row[col]]
            outer_label_value = '-'.join([str(outer_row[col])
                                          for col in outer_group_cols])
            for col, metric in enumerate(metric_cols):
                ax = axes[row_idx, col]
                pt = df_outer.pivot_table(index=inner_group_cols,
                                          columns=compare_col,
                                          values=metric).fillna(0)
                x_labels = ['/'.join([str(v) for v in idx])
                            for idx in pt.index]
                x = np.arange(len(x_labels))
                for i, engine in enumerate(engine_values):
                    if engine in pt.columns:
                        values = pt[engine].values
                        # Centre the cluster of bars around each tick.
                        offset = (i - n_engines / 2 + 0.5) * bar_width
                        bars = ax.bar(x + offset, values, bar_width,
                                      label=engine, color=colors[i],
                                      edgecolor='white', linewidth=0.5)
                        for bar, val in zip(bars, values):
                            if val > 0:
                                y_pos = (bar.get_height()
                                         + bar.get_height() * 0.02
                                         if bar.get_height() > 0 else 1)
                                ax.text(bar.get_x() + bar.get_width() / 2,
                                        y_pos, f'{val:.1f}', ha='center',
                                        va='bottom', fontsize=7,
                                        fontweight='bold')
                ax.set_xlabel('/'.join(inner_group_cols), fontsize=9)
                ax.set_ylabel(metric, fontsize=10)
                ax.set_title(f'{outer_label}={outer_label_value} - {metric}',
                             fontsize=11, fontweight='bold')
                ax.set_xticks(x)
                ax.set_xticklabels(x_labels, rotation=45, ha='right',
                                   fontsize=7)
                ax.grid(axis='y', alpha=0.3, linestyle='--')
                ax.legend(fontsize=6, loc='upper right', framealpha=0.9,
                          ncol=1)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='white')
    plt.close()
    return True
# ---------------------------------------------------------------------------
# Main script: load config, normalise column names, filter, and emit charts.
# ---------------------------------------------------------------------------
print(f"从配置文件加载数据: {args.配置}")
with open(args.配置, 'r', encoding='utf-8') as f:
    config = json.load(f)

df = load_data_from_files(config)
if df.empty:
    print("未加载到数据")
    exit(1)

print(f"\n可用列名: {df.columns.tolist()}")

# Map each canonical column name to whichever alternative spelling actually
# appears in the loaded data.
# NOTE(review): the inner loop has no break, so the LAST matching alternative
# wins when several are present — confirm this is the intended precedence.
col_mapping = {}
for std_col, alt_cols in [
    ('模型', ['模型', 'model', 'Model']),
    ('卡类型', ['卡类型', 'card_type', '卡']),
    ('卡数', ['卡数', 'num_cards', '卡数', 'GPU数量']),
    ('vLLM版本', ['vLLM版本', 'vllm_version', 'vLLM版本']),
    ('V0/V1 Engine', ['V0/V1 Engine', 'Engine', 'engine']),
    ('输入长度(tokens)', ['输入长度(tokens)', 'input_length', 'input length', '输入长度']),
    ('输出长度(tokens)', ['输出长度(tokens)', 'output_length', 'output length', '输出长度']),
    ('并发数', ['并发数', 'concurrency', '并发', 'num_concurrent']),
    ('平均首字延时TTFT(ms)', ['平均首字延时TTFT(ms)', 'ttft', 'TTFT', '首字延时']),
    ('平均生成时间TPOT(ms)', ['平均生成时间TPOT(ms)', 'tpot', 'TPOT', '生成时间']),
    ('生成吞吐量(tokens/s)', ['生成吞吐量(tokens/s)', 'gen_throughput', '生成吞吐']),
    ('总吞吐量(tokens/s)', ['总吞吐量(tokens/s)', 'total_throughput', '总吞吐']),
]:
    for alt in alt_cols:
        if alt in df.columns:
            col_mapping[std_col] = alt

print(f"\n列映射: {col_mapping}")
df_renamed = df.rename(columns=col_mapping)

# Apply the optional row filter from the config.
filter_config = config.get('filter', {})
df_renamed = apply_filter(df_renamed, filter_config)
print(f"过滤后数据量: {len(df_renamed)}")

metric_cols = ['平均首字延时TTFT(ms)', '平均生成时间TPOT(ms)',
               '生成吞吐量(tokens/s)', '总吞吐量(tokens/s)']

# Columns whose distinct value combinations each get their own output chart.
dist_cols_config = config.get('distinguish', ['模型', '卡数'])
dist_cols = [col_mapping.get(c, c) for c in dist_cols_config]
dist_cols = [c for c in dist_cols if c in df_renamed.columns]

os.makedirs(args.输出目录, exist_ok=True)

# group_by may be a list of lists ([outer, inner]) or a flat list (inner
# only).  Robustness fix: an empty group_by list previously raised
# IndexError on group_by[0].
group_by = config.get('group_by', [[], []])
if group_by and isinstance(group_by[0], list):
    outer_group = group_by[0]
    inner_group = group_by[1] if len(group_by) > 1 else []
else:
    outer_group = []
    inner_group = group_by

dist_combinations = df_renamed.groupby(dist_cols).size().reset_index()
print(f"\n将生成 {len(dist_combinations)} 个图表...")

chart_count = 0
for idx, (_, dist_row) in enumerate(dist_combinations.iterrows()):
    df_subset = df_renamed.copy()
    for dist_col in dist_cols:
        df_subset = df_subset[df_subset[dist_col] == dist_row[dist_col]]
    # Build a filesystem-safe file name from the distinguishing values.
    filter_parts = []
    for dist_col in dist_cols:
        val = dist_row[dist_col]
        safe_col_name = dist_col.replace('/', '_').replace('\\', '_')[:10]
        filter_parts.append(f"{safe_col_name}_{val}")
    output_filename = '_'.join(filter_parts) + ".png"
    output_path = os.path.join(args.输出目录, output_filename)
    print(f"[{idx + 1}/{len(dist_combinations)}] 生成图表: {output_filename}")
    success = generate_chart(df_subset, output_path, 'vLLM_Engine',
                             outer_group, inner_group, metric_cols,
                             args.合并分组)
    if success:
        chart_count += 1

print(f"\n完成!共生成 {chart_count} 个图表,保存到目录: {args.输出目录}")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment