Commit b104ec49 authored by sharkgene@qq.com

add norm

parent 471cae9f
__pycache__
.*swp
.*swo
.~*
{
    "filter": {
        "并发数": [],
        "模型": []
    },
    "dist_cols": ["模型", "卡数"],
    "group_cols": [["并发数"], ["输入长度(tokens)", "输出长度(tokens)"]],
    "key_cols": ["卡类型", "vLLM版本", "V0/V1 Engine"],
    "metric_cols": ["平均首字延时TTFT(ms)", "平均生成时间TPOT(ms)", "生成吞吐量(tokens/s)", "总吞吐量(tokens/s)"],
    "files": [
        {
            "file": "K100_AI_2026Baseline.xlsx",
            "sheets": [],
            "column_mapping": {},
            "column_replace": {},
            "column_add": {}
        }
    ]
}
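A note on how the plotting script below consumes this config: dist_cols yields one chart file per (模型, 卡数) combination; group_cols appears to be read as [outer, inner] grouping, the outer level selecting a subplot block and the inner level forming the x-axis ticks; and each row's key_cols values are joined with '_' into a ColKey series label (a hypothetical row with 卡类型=K100, vLLM版本=0.6.3, V0/V1 Engine=V1 would be labeled 'K100_0.6.3_V1'), so rows sharing a ColKey are compared as one series within each chart.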
@@ -9,12 +9,48 @@ import os
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

parser = argparse.ArgumentParser(description='绘制模型性能对比图表(支持归一化)')
parser.add_argument('--配置', '-f', type=str, default='data_config.json', help='数据配置文件路径')
parser.add_argument('--输出目录', '-d', type=str, default='charts', help='输出图表目录')
parser.add_argument('--合并分组', '-m', action='store_true', help='将第一层分组合并到一张图中')
args = parser.parse_args()
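# Invocation sketch (the script filename is a placeholder; the flags are the ones
# defined above, each also reachable via its short form):
#   python plot_perf.py --配置 data_config.json --输出目录 charts --合并分组
#   python plot_perf.py -f data_config.json -d charts -m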
def fill_merged_cells(df, file_path, sheet_name):
    """Fill merged cells: propagate each merged range's top-left value, then forward-fill remaining gaps."""
    try:
        # pandas opens workbooks read-only, so load a writable copy with openpyxl.
        from openpyxl import load_workbook
        wb = load_workbook(file_path)
        ws = wb[sheet_name]
        merged_ranges = list(ws.merged_cells.ranges)
        if not merged_ranges:
            return df
        for merged_range in merged_ranges:
            min_col, min_row = merged_range.min_col, merged_range.min_row
            max_col, max_row = merged_range.max_col, merged_range.max_row
            first_cell = ws.cell(min_row, min_col).value
            # Non-top-left cells of a merged range are read-only MergedCell
            # objects; unmerge before writing into them.
            ws.unmerge_cells(str(merged_range))
            for row in range(min_row, max_row + 1):
                for col in range(min_col, max_col + 1):
                    ws.cell(row, col).value = first_cell
        # Rebuild the DataFrame from the in-memory worksheet; re-reading the
        # file from disk would discard the fills applied above.
        rows = ws.values
        header = next(rows)
        df = pd.DataFrame(rows, columns=header)
    except Exception:
        pass
    for col in df.columns:
        df[col] = df[col].ffill()  # fillna(method='ffill') is deprecated
        if pd.api.types.is_numeric_dtype(df[col]):
            col_values = df[col].dropna()
            if len(col_values) > 0 and col_values.apply(lambda x: float(x).is_integer()).all():
                try:
                    df[col] = df[col].astype(int)
                except (ValueError, TypeError):
                    pass
    return df
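# Usage sketch ('report.xlsx' and 'Sheet1' are placeholder names):
#   df = pd.read_excel('report.xlsx', sheet_name='Sheet1')
#   df = fill_merged_cells(df, 'report.xlsx', 'Sheet1')
# Afterwards every cell of a formerly merged range carries the range's top-left
# value, and any remaining gaps are forward-filled column by column.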
def load_data_from_files(config):
    all_data = []
    files_config = config.get('files', [])
@@ -39,6 +75,7 @@ def load_data_from_files(config):
        for sheet in sheets:
            try:
                df = pd.read_excel(file_path, sheet_name=sheet)
                df = fill_merged_cells(df, file_path, sheet)
                df.columns = df.columns.str.replace('\n', '').str.strip()
                if column_mapping:
@@ -75,11 +112,148 @@ def apply_filter(df, filter_dict):
            df = df[df[filter_col] == filter_values]
    return df

def parse_metric_cols(metric_cols, key_cols):
"""解析 metric_cols,分离普通列和需要归一化的列"""
normal_metrics = []
normalize_configs = []
for m in metric_cols:
if isinstance(m, dict):
for col_name, base_value in m.items():
if isinstance(base_value, list):
base_dict = {}
for i, k in enumerate(key_cols):
if i < len(base_value):
base_dict[k] = base_value[i]
normalize_configs.append({
'column': col_name,
'base_value': base_dict
})
elif isinstance(base_value, dict):
normalize_configs.append({
'column': col_name,
'base_value': base_value
})
else:
parts = str(base_value).split('_')
base_dict = {}
for i, k in enumerate(key_cols):
if i < len(parts):
base_dict[k] = parts[i]
normalize_configs.append({
'column': col_name,
'base_value': base_dict
})
elif isinstance(m, list):
for item in m:
if isinstance(item, dict):
for col_name, base_value in item.items():
if isinstance(base_value, list):
base_dict = {}
for i, k in enumerate(key_cols):
if i < len(base_value):
base_dict[k] = base_value[i]
normalize_configs.append({
'column': col_name,
'base_value': base_dict
})
else:
normalize_configs.append({
'column': col_name,
'base_value': base_value
})
else:
normal_metrics.append(item)
else:
normal_metrics.append(m)
return normal_metrics, normalize_configs
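# Worked example (mirrors the weichai config at the bottom of this commit):
#   parse_metric_cols(["TTFT_mean(ms)", {"TPOT_mean(ms)": ["BW1100", "NPC"]}],
#                     key_cols=["DCU", "option"])
# returns
#   normal_metrics    == ['TTFT_mean(ms)']
#   normalize_configs == [{'column': 'TPOT_mean(ms)',
#                          'base_value': {'DCU': 'BW1100', 'option': 'NPC'}}]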
def apply_normalization(df, key_cols, normalize_configs, group_cols):
    """Normalize the configured columns (independently within each group)."""
    df = df.copy()
    for config in normalize_configs:
        col = config['column']
        base_value = config['base_value']
        if col not in df.columns:
            continue
        if not key_cols:
            print(f" 警告: key_cols为空,跳过归一化")
            continue
        valid_key_cols = [k for k in key_cols if k in df.columns]
        if not valid_key_cols:
            print(f" 警告: 未找到有效的key_cols {key_cols},跳过归一化")
            continue
        if isinstance(base_value, dict):
            base_dict = base_value
        else:
            base_dict = {}
            base_parts = base_value.split('_')
            for i, k in enumerate(valid_key_cols):
                if i < len(base_parts):
                    base_dict[k] = base_parts[i]
        effective_group_cols = [c for c in group_cols if c and c in df.columns]
        if effective_group_cols:
            def normalize_group(group):
                base_mask = pd.Series(True, index=group.index)
                for k, v in base_dict.items():
                    if k in group.columns:
                        base_mask = base_mask & (group[k] == v)
                if base_mask.any():
                    base_val = group.loc[base_mask, col].mean()
                    if base_val == 0 or pd.isna(base_val):
                        # Fall back to the first non-zero value in the group.
                        valid_data = group[group[col].notna() & (group[col] != 0)]
                        if len(valid_data) > 0:
                            base_val = valid_data[col].iloc[0]
                        else:
                            return group
                else:
                    valid_data = group[group[col].notna() & (group[col] != 0)]
                    if len(valid_data) > 0:
                        base_val = valid_data[col].iloc[0]
                    else:
                        return group
                if base_val == 0 or pd.isna(base_val):
                    return group
                group[col] = (group[col] / base_val) * 100
                return group

            df = df.groupby(effective_group_cols, group_keys=False).apply(normalize_group)
            print(f" 归一化列 '{col}': 每个分组内基准值 {base_dict} = 100%")
        else:
            base_mask = pd.Series(True, index=df.index)
            for k, v in base_dict.items():
                if k in df.columns:
                    base_mask = base_mask & (df[k] == v)
            if not base_mask.any():
                print(f" 警告: 未找到基准值 {base_dict},跳过归一化")
                continue
            base_val = df.loc[base_mask, col].mean()
            if base_val == 0 or pd.isna(base_val):
                valid_data = df[df[col].notna() & (df[col] != 0)]
                if len(valid_data) > 0:
                    base_val = valid_data[col].iloc[0]
                else:
                    print(f" 警告: 无有效数据,跳过归一化")
                    continue
            df[col] = (df[col] / base_val) * 100
            print(f" 归一化列 '{col}': 基准值 {base_dict} = 100%")
    return df
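# Worked example: within one group, if the baseline rows (say DCU == 'BW1100' and
# option == 'NPC') have a mean TPOT_mean(ms) of 50.0, a row with value 60.0 is
# rescaled to (60.0 / 50.0) * 100 = 120.0, i.e. 120% of the baseline, and the
# baseline level itself maps to 100.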

def generate_chart(df_subset, output_path, colkey, outer_group_cols, inner_group_cols, metric_cols,
                   normalize_configs=None, merge_groups=False):
    df_subset = df_subset.copy()

    compare_col = "ColKey"
    df_subset[compare_col] = df_subset[colkey].apply(lambda x: '_'.join(x.dropna().astype(str)), axis=1)

    all_group_cols = outer_group_cols + inner_group_cols
    if all_group_cols:
@@ -101,15 +275,10 @@ def generate_chart(df_subset, output_path, colkey, outer_group_cols, inner_group
    engine_values = df_grouped[compare_col].unique()
    n_engines = len(engine_values)

    color_palette = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D', '#3B1F2B', '#95C623', '#7B2D26']
    colors = [color_palette[i % len(color_palette)] for i in range(n_engines)]
    normalized_cols = [c['column'] for c in (normalize_configs or [])]

    if merge_groups and n_outer > 1:
        fig, axes = plt.subplots(1, 4, figsize=(8 * n_outer + 20, 10))
@@ -158,6 +327,10 @@ def generate_chart(df_subset, output_path, colkey, outer_group_cols, inner_group
            for bar, val in zip(bars, values):
                if val > 0:
                    y_pos = bar.get_height() + bar.get_height()*0.02 if bar.get_height() > 0 else 1
                    if metric in normalized_cols:
                        ax.text(bar.get_x() + bar.get_width()/2, y_pos,
                                f'{val:.0f}%', ha='center', va='bottom', fontsize=5, fontweight='bold')
                    else:
                        ax.text(bar.get_x() + bar.get_width()/2, y_pos,
                                f'{val:.1f}', ha='center', va='bottom', fontsize=5, fontweight='bold')
@@ -183,6 +356,10 @@ def generate_chart(df_subset, output_path, colkey, outer_group_cols, inner_group
            ax.set_xticklabels(inner_labels, rotation=45, ha='right', fontsize=6)
            ax.set_xlabel('/'.join(inner_group_cols), fontsize=9)
            if metric in normalized_cols:
                ax.set_ylabel(f'{metric} (%)', fontsize=10)
                ax.set_title(f'{metric} (归一化)', fontsize=12, fontweight='bold')
            else:
                ax.set_ylabel(metric, fontsize=10)
                ax.set_title(f'{metric}', fontsize=12, fontweight='bold')
            ax.grid(axis='y', alpha=0.3, linestyle='--')
@@ -225,10 +402,18 @@ def generate_chart(df_subset, output_path, colkey, outer_group_cols, inner_group
            for bar, val in zip(bars, values):
                if val > 0:
                    y_pos = bar.get_height() + bar.get_height()*0.02 if bar.get_height() > 0 else 1
                    if metric in normalized_cols:
                        ax.text(bar.get_x() + bar.get_width()/2, y_pos,
                                f'{val:.0f}%', ha='center', va='bottom', fontsize=7, fontweight='bold')
                    else:
                        ax.text(bar.get_x() + bar.get_width()/2, y_pos,
                                f'{val:.1f}', ha='center', va='bottom', fontsize=7, fontweight='bold')

            ax.set_xlabel('/'.join(inner_group_cols), fontsize=9)
            if metric in normalized_cols:
                ax.set_ylabel(f'{metric} (%)', fontsize=10)
                ax.set_title(f'{outer_label}={outer_label_value} - {metric} (归一化)', fontsize=11, fontweight='bold')
            else:
                ax.set_ylabel(metric, fontsize=10)
                ax.set_title(f'{outer_label}={outer_label_value} - {metric}', fontsize=11, fontweight='bold')
            ax.set_xticks(x)
@@ -307,6 +492,12 @@ metric_cols = config.get('metric_cols', [
    '总吞吐量(tokens/s)'
])

normal_metrics, normalize_configs = parse_metric_cols(metric_cols, key_cols)
all_metric_cols = normal_metrics + [c['column'] for c in normalize_configs]
print(f"\n普通指标: {normal_metrics}")
print(f"归一化配置: {normalize_configs}")

dist_combinations = df_renamed.groupby(dist_cols).size().reset_index()
print(f"\n将生成 {len(dist_combinations)} 个图表...")
@@ -326,14 +517,28 @@ for idx, (_, dist_row) in enumerate(dist_combinations.iterrows()):
    output_filename = '_'.join(filter_parts) + ".png"
    output_path = os.path.join(args.输出目录, output_filename)

    orig_count = len(df_subset)
    df_grouped = df_subset.groupby(key_cols).size().reset_index(name='count')
    grouped_count = len(df_grouped)
    print(f"[{idx+1}/{len(dist_combinations)}] {output_filename}: 原始{orig_count}行 -> 分组后{grouped_count}组")
    print(f" 分组详情: {df_grouped[key_cols].values.tolist()}")

    for c in all_metric_cols:
        if c in df_subset.columns:
            numeric_vals = pd.to_numeric(df_subset[c], errors='coerce')
            if numeric_vals.notna().any():
                sample = numeric_vals.dropna().iloc[0]
                if isinstance(sample, (int, np.integer)) and not pd.isna(numeric_vals).any():
                    df_subset[c] = numeric_vals
                else:
                    df_subset[c] = numeric_vals.fillna(0)

    if normalize_configs:
        group_cols = outer_group + inner_group
        df_subset = apply_normalization(df_subset, key_cols, normalize_configs, group_cols)

    success = generate_chart(df_subset, output_path, key_cols, outer_group, inner_group, all_metric_cols,
                             normalize_configs=normalize_configs, merge_groups=args.合并分组)
    if success:
        chart_count += 1
{
    "filter": {
        "并发数": [],
        "模型": []
    },
    "dist_cols": ["model_name", "DCU nums"],
    "group_cols": ["bs", "input_len", "output_len"],
    "key_cols": ["option"],
    "metric_cols": ["TTFT_mean(ms)", "TPOT_mean(ms)", "GenerateThroughput(tokens/s)", "TotalThroughput(tokens/s)"],
    "files": [
        {
            "file": "weichai.xlsx",
            "sheets": [],
            "column_mapping": {},
            "column_replace": {},
            "column_add": {}
        }
    ]
}
{
    "filter": {
        "并发数": [],
        "模型": []
    },
    "dist_cols": ["model_name", "DCU nums"],
    "group_cols": ["bs", "input_len", "output_len"],
    "key_cols": ["DCU", "option"],
    "metric_cols": ["TTFT_mean(ms)", {"TPOT_mean(ms)": ["BW1100", "NPC"]}],
    "files": [
        {
            "file": "weichai.xlsx",
            "sheets": [],
            "column_mapping": {},
            "column_replace": {},
            "column_add": {}
        }
    ]
}
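This last config exercises the normalization syntax this commit adds: the dict entry {"TPOT_mean(ms)": ["BW1100", "NPC"]} asks for TPOT_mean(ms) to be rescaled so that the rows whose key_cols match positionally (DCU == "BW1100", option == "NPC") average 100%, computed independently within each group defined by group_cols. Per parse_metric_cols, the base value may equivalently be written as a dict or as an underscore-joined string:

    "metric_cols": ["TTFT_mean(ms)", {"TPOT_mean(ms)": {"DCU": "BW1100", "option": "NPC"}}]
    "metric_cols": ["TTFT_mean(ms)", {"TPOT_mean(ms)": "BW1100_NPC"}]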