Commit 9afddf86 authored by sharkgene@qq.com's avatar sharkgene@qq.com
Browse files

init version

parent 089c9e5a
# DataAnalysis
将大模型推理测试生成的excel数据文件,按照特定方式生成柱状图进行比较
配置文件data_config.json示例说明:
{
"filter": {
"并发数": [],
"模型": []
},
"distinguish": ["模型", "卡类型", "卡数"],
"group_by": [["并发数"], ["输入长度(tokens)", "输出长度(tokens)"]],
"files": [
{
"file": "test100.xlsx",
"sheets": [],
"column_mapping": {
},
"column_replace": {
}
}
]
}
filter: 过滤条件,不设置或者为空则不过滤
distinguish: 选择输出文件的条件,例如["模型", "卡类型", "卡数"]则将相同的模型、卡类型和卡数的所有数据生成一个文件
group_by: 可以进行最多2重分组,每层一个图,第2层以不同柱状显示数据。如果命令行参数指定合并,则所有分组合并到一个图中,分组之间间隔开
files: 指定文件信息
## 使用方法
python3 plot_comparison.py --help
usage: plot_comparison.py [-h] [--配置 配置] [--输出目录 输出目录] [--合并分组]
绘制模型性能对比图表
options:
-h, --help show this help message and exit
--配置 配置, -f 配置 数据配置文件路径
--输出目录 输出目录, -d 输出目录 输出图表目录
--合并分组, -m 将第一层分组合并到一张图中
##
{
"filter": {
"并发数": [],
"模型": []
},
"distinguish": ["模型", "卡类型", "卡数"],
"group_by": [["并发数"], ["输入长度(tokens)", "输出长度(tokens)"]],
"files": [
{
"file": "test100.xlsx",
"sheets": [],
"column_mapping": {
},
"column_replace": {
}
}
]
}
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import argparse
import json
import os
# Matplotlib globals: make the minus sign renderable and prefer fonts
# that can display CJK labels, falling back to DejaVu Sans.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']

# Command-line interface (option names are intentionally in Chinese;
# they become attributes of `args`, e.g. args.配置).
parser = argparse.ArgumentParser(description='绘制模型性能对比图表')
for _flags, _kwargs in (
    (('--配置', '-f'), dict(type=str, default='data_config.json', help='数据配置文件路径')),
    (('--输出目录', '-d'), dict(type=str, default='charts', help='输出图表目录')),
    (('--合并分组', '-m'), dict(action='store_true', help='将第一层分组合并到一张图中')),
):
    parser.add_argument(*_flags, **_kwargs)
args = parser.parse_args()
def load_data_from_files(config):
    """Load and concatenate data from every Excel file listed in the config.

    Args:
        config: dict with an optional 'files' list. Each entry may carry
            'file' (path to an .xlsx), 'sheets' (list of sheet names;
            empty or None means "all sheets"), 'column_mapping' (rename
            dict applied to the headers) and 'column_replace'
            (per-column value-replacement dicts).

    Returns:
        One DataFrame with 'source_file'/'source_sheet' provenance columns
        appended to every row, or an empty DataFrame when nothing was read.
    """
    all_data = []
    files_config = config.get('files', [])
    for file_config in files_config:
        file_path = file_config.get('file')
        sheets = file_config.get('sheets', [])
        column_mapping = file_config.get('column_mapping', {})
        # Guard against a missing 'file' key as well as a nonexistent path
        # (os.path.exists(None) would raise TypeError).
        if not file_path or not os.path.exists(file_path):
            print(f"文件不存在: {file_path}, 跳过")
            continue
        # Context manager releases the workbook handle; the original
        # leaked one open ExcelFile per input file.
        with pd.ExcelFile(file_path) as xl:
            if sheets is None or (isinstance(sheets, list) and len(sheets) == 0):
                sheets = xl.sheet_names
            else:
                sheets = [s for s in sheets if s]
            for sheet in sheets:
                try:
                    # Reuse the already-open workbook instead of re-opening
                    # the file once per sheet.
                    df = pd.read_excel(xl, sheet_name=sheet)
                    # Normalize headers: drop embedded newlines and edge spaces.
                    df.columns = df.columns.str.replace('\n', '').str.strip()
                    if column_mapping:
                        df = df.rename(columns=column_mapping)
                    column_replace = file_config.get('column_replace', {})
                    for col, replace_dict in column_replace.items():
                        if col in df.columns:
                            df[col] = df[col].replace(replace_dict)
                    # Provenance columns so every row can be traced back.
                    df['source_file'] = file_path
                    df['source_sheet'] = sheet
                    all_data.append(df)
                    print(f"读取: {file_path} - {sheet}, {len(df)} 行")
                except Exception as e:
                    # Best effort: a broken sheet is reported and skipped,
                    # never aborts the whole run.
                    print(f"读取失败: {file_path} - {sheet}: {e}")
    if not all_data:
        return pd.DataFrame()
    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df
def apply_filter(df, filter_dict):
    """Restrict *df* to the rows matching every entry of *filter_dict*.

    Columns absent from the frame and empty/falsy filter values are
    ignored. A list value keeps rows whose cell is a member of the list;
    a scalar value keeps rows equal to it. Returns the filtered frame.
    """
    for column, wanted in filter_dict.items():
        # Skip unknown columns and "no filter" (empty list / None / '').
        if column not in df.columns or not wanted:
            continue
        if isinstance(wanted, list):
            mask = df[column].isin(wanted)
        else:
            mask = df[column] == wanted
        df = df[mask]
    return df
def generate_chart(df_subset, output_path, compare_col, outer_group_cols, inner_group_cols, metric_cols, merge_groups=False):
    """Render grouped bar charts comparing engine variants and save a PNG.

    A synthetic comparison key (*compare_col*) is built from the
    'vLLM版本' and 'V0/V1 Engine' columns; each distinct key gets one bar
    colour. Rows are averaged per (outer groups + inner groups + key).
    With merge_groups=True and more than one outer group, all outer
    groups share one row of axes, separated by gaps; otherwise one row
    of subplots is drawn per outer group.

    Returns True when a chart was written, False when there was no data.

    NOTE(review): both plt.subplots calls hard-code 4 columns — this
    assumes len(metric_cols) == 4; other lengths would break. Confirm.
    """
    df_subset = df_subset.copy()
    # Build the comparison key, e.g. "0.6.3_V1".
    df_subset[compare_col] = df_subset['vLLM版本'].astype(str) + '_' + df_subset['V0/V1 Engine'].astype(str)
    all_group_cols = outer_group_cols + inner_group_cols
    if all_group_cols:
        # Average the metrics within each (group columns + key) cell.
        df_grouped = df_subset[all_group_cols + [compare_col] + metric_cols].groupby(all_group_cols + [compare_col]).mean().reset_index()
    else:
        df_grouped = df_subset[[compare_col] + metric_cols].groupby([compare_col]).mean().reset_index()
        # NOTE(review): this overwrites the engine labels with 0..n-1 —
        # looks unintended; verify the no-grouping code path.
        df_grouped[compare_col] = df_grouped.index
    if len(df_grouped) == 0:
        print(f" 无数据,跳过")
        return False
    # One row per distinct combination of the outer grouping columns
    # (a single placeholder row when there is no outer grouping).
    if outer_group_cols:
        outer_values = df_grouped.groupby(outer_group_cols).size().reset_index()
    else:
        outer_values = pd.DataFrame({'': ['all']})
    n_outer = len(outer_values)
    engine_values = df_grouped[compare_col].unique()
    n_engines = len(engine_values)
    # Fixed palette, cycled when there are more engines than colours.
    color_palette = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D', '#3B1F2B', '#95C623', '#7B2D26']
    colors = [color_palette[i % len(color_palette)] for i in range(n_engines)]
    if merge_groups and n_outer > 1:
        # Merged layout: every outer group drawn side by side on the same
        # axis, one axis per metric. x positions are tracked manually.
        fig, axes = plt.subplots(1, 4, figsize=(8 * n_outer + 20, 10))
        bar_width = 0.12
        bar_spacing = 0.05
        group_gap = 3  # horizontal gap (in x units) between outer groups
        x_labels_all = None
        for col, metric in enumerate(metric_cols):
            ax = axes[col]
            current_x = 0  # running x offset of the current outer group
            for row_idx, (_, outer_row) in enumerate(outer_values.iterrows()):
                # Slice the aggregated frame down to this outer group.
                df_outer = df_grouped.copy()
                for gcol in outer_group_cols:
                    df_outer = df_outer[df_outer[gcol] == outer_row[gcol]]
                outer_label_value = '-'.join([str(outer_row[gcol]) for gcol in outer_group_cols])
                # Inner groups as rows, engines as columns, metric as values.
                pt = df_outer.pivot_table(
                    index=inner_group_cols,
                    columns=compare_col,
                    values=metric
                ).fillna(0)
                n_bars_per_group = len(pt)
                group_width = n_bars_per_group * n_engines * (bar_width + bar_spacing) + group_gap
                group_center = current_x + group_width / 2
                x_labels = ['/'.join([str(v) for v in idx]) for idx in pt.index]
                # NOTE(review): assumes every outer group has the same inner
                # labels — only the first group's labels are kept for ticks.
                if x_labels_all is None:
                    x_labels_all = x_labels
                x = np.arange(len(x_labels)) * (n_engines * (bar_width + bar_spacing)) + current_x
                for i, engine in enumerate(engine_values):
                    if engine in pt.columns:
                        values = pt[engine].values
                        offset = i * bar_width
                        label = f"{engine} ({outer_label_value})"
                        bars = ax.bar(x + offset, values, bar_width, label=label, color=colors[i], edgecolor='white', linewidth=0.5)
                        # Value labels above each non-zero bar.
                        for bar, val in zip(bars, values):
                            if val > 0:
                                y_pos = bar.get_height() + bar.get_height()*0.02 if bar.get_height() > 0 else 1
                                ax.text(bar.get_x() + bar.get_width()/2, y_pos,
                                        f'{val:.1f}', ha='center', va='bottom', fontsize=5, fontweight='bold')
                # Dashed separator after this outer group, plus its label box.
                ax.axvline(x=current_x + n_bars_per_group * n_engines * (bar_width + bar_spacing) + group_gap/2, color='gray', linestyle='--', linewidth=1)
                ax.text(group_center, ax.get_ylim()[1] * 0.95, outer_label_value,
                        ha='center', va='top', fontsize=9, fontweight='bold',
                        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
                current_x = current_x + n_bars_per_group * n_engines * (bar_width + bar_spacing) + group_gap
            # Recompute tick positions for every inner label of every group.
            total_inner_labels = len(x_labels_all)
            inner_positions = []
            inner_labels = []
            for gi in range(len(outer_values)):
                base_x = gi * (total_inner_labels * n_engines * (bar_width + bar_spacing) + group_gap)
                for xi in range(total_inner_labels):
                    center_pos = base_x + xi * n_engines * (bar_width + bar_spacing) + (n_engines * bar_width + (n_engines-1) * bar_spacing) / 2
                    inner_positions.append(center_pos)
                    inner_labels.append(x_labels_all[xi])
            ax.set_xticks(inner_positions)
            ax.set_xticklabels(inner_labels, rotation=45, ha='right', fontsize=6)
            ax.set_xlabel('/'.join(inner_group_cols), fontsize=9)
            ax.set_ylabel(metric, fontsize=10)
            ax.set_title(f'{metric}', fontsize=12, fontweight='bold')
            ax.grid(axis='y', alpha=0.3, linestyle='--')
            ax.legend(fontsize=5, loc='upper right', framealpha=0.9, ncol=1)
    else:
        # Grid layout: one row of axes per outer group, one column per metric.
        fig, axes = plt.subplots(n_outer, 4, figsize=(24, 5 * n_outer))
        if n_outer == 1:
            # Keep 2-D indexing (axes[row, col]) even for a single row.
            axes = axes.reshape(1, -1)
        bar_width = 0.2
        outer_label = '/'.join(outer_group_cols) if outer_group_cols else '全部'
        for row_idx, (_, outer_row) in enumerate(outer_values.iterrows()):
            df_outer = df_grouped.copy()
            for col in outer_group_cols:
                df_outer = df_outer[df_outer[col] == outer_row[col]]
            outer_label_value = '-'.join([str(outer_row[col]) for col in outer_group_cols])
            for col, metric in enumerate(metric_cols):
                ax = axes[row_idx, col]
                pt = df_outer.pivot_table(
                    index=inner_group_cols,
                    columns=compare_col,
                    values=metric
                ).fillna(0)
                x_labels = ['/'.join([str(v) for v in idx]) for idx in pt.index]
                x = np.arange(len(x_labels))
                for i, engine in enumerate(engine_values):
                    if engine in pt.columns:
                        values = pt[engine].values
                        # Center the cluster of engine bars on each tick.
                        offset = (i - n_engines/2 + 0.5) * bar_width
                        bars = ax.bar(x + offset, values, bar_width, label=engine, color=colors[i], edgecolor='white', linewidth=0.5)
                        for bar, val in zip(bars, values):
                            if val > 0:
                                y_pos = bar.get_height() + bar.get_height()*0.02 if bar.get_height() > 0 else 1
                                ax.text(bar.get_x() + bar.get_width()/2, y_pos,
                                        f'{val:.1f}', ha='center', va='bottom', fontsize=7, fontweight='bold')
                ax.set_xlabel('/'.join(inner_group_cols), fontsize=9)
                ax.set_ylabel(metric, fontsize=10)
                ax.set_title(f'{outer_label}={outer_label_value} - {metric}', fontsize=11, fontweight='bold')
                ax.set_xticks(x)
                ax.set_xticklabels(x_labels, rotation=45, ha='right', fontsize=7)
                ax.grid(axis='y', alpha=0.3, linestyle='--')
                ax.legend(fontsize=6, loc='upper right', framealpha=0.9, ncol=1)
    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='white')
    plt.close()
    return True
# ---- Script entry: load config, normalize columns, emit one chart per
# ---- distinct combination of the "distinguish" columns. ----
print(f"从配置文件加载数据: {args.配置}")
with open(args.配置, 'r', encoding='utf-8') as f:
    config = json.load(f)
df = load_data_from_files(config)
if df.empty:
    print("未加载到数据")
    exit(1)
print(f"\n可用列名: {df.columns.tolist()}")
# Map each standard (Chinese) column name to whichever alternative
# spelling actually appears in the loaded data; the LAST matching
# alternative wins because the inner loop keeps overwriting.
col_mapping = {}
for std_col, alt_cols in [
    ('模型', ['模型', 'model', 'Model']),
    ('卡类型', ['卡类型', 'card_type', '卡']),
    # NOTE(review): duplicate entries in the next two lists ('卡数',
    # 'vLLM版本' appear twice) — harmless but likely copy-paste slips.
    ('卡数', ['卡数', 'num_cards', '卡数', 'GPU数量']),
    ('vLLM版本', ['vLLM版本', 'vllm_version', 'vLLM版本']),
    ('V0/V1 Engine', ['V0/V1 Engine', 'Engine', 'engine']),
    ('输入长度(tokens)', ['输入长度(tokens)', 'input_length', 'input length', '输入长度']),
    ('输出长度(tokens)', ['输出长度(tokens)', 'output_length', 'output length', '输出长度']),
    ('并发数', ['并发数', 'concurrency', '并发', 'num_concurrent']),
    ('平均首字延时TTFT(ms)', ['平均首字延时TTFT(ms)', 'ttft', 'TTFT', '首字延时']),
    ('平均生成时间TPOT(ms)', ['平均生成时间TPOT(ms)', 'tpot', 'TPOT', '生成时间']),
    ('生成吞吐量(tokens/s)', ['生成吞吐量(tokens/s)', 'gen_throughput', '生成吞吐']),
    ('总吞吐量(tokens/s)', ['总吞吐量(tokens/s)', 'total_throughput', '总吞吐'])
]:
    for alt in alt_cols:
        if alt in df.columns:
            col_mapping[std_col] = alt
print(f"\n列映射: {col_mapping}")
# NOTE(review): rename(columns=...) maps {old: new}, but col_mapping is
# {standard: found-alternative} — when the alternative differs from the
# standard name this renames in the wrong direction; confirm intent.
df_renamed = df.rename(columns=col_mapping)
filter_config = config.get('filter', {})
df_renamed = apply_filter(df_renamed, filter_config)
print(f"过滤后数据量: {len(df_renamed)}")
# The four metrics plotted, one subplot column each in generate_chart.
metric_cols = [
    '平均首字延时TTFT(ms)',
    '平均生成时间TPOT(ms)',
    '生成吞吐量(tokens/s)',
    '总吞吐量(tokens/s)'
]
# Columns whose distinct value combinations each get their own PNG.
dist_cols_config = config.get('distinguish', ['模型', '卡数'])
dist_cols = [col_mapping.get(c, c) for c in dist_cols_config]
dist_cols = [c for c in dist_cols if c in df_renamed.columns]
os.makedirs(args.输出目录, exist_ok=True)
# group_by may be a list of lists ([outer, inner]) or a flat list
# (inner only); normalize to (outer_group, inner_group).
group_by = config.get('group_by', [[], []])
if isinstance(group_by[0], list):
    outer_group = group_by[0] if len(group_by) > 0 else []
    inner_group = group_by[1] if len(group_by) > 1 else []
else:
    outer_group = []
    inner_group = group_by
dist_combinations = df_renamed.groupby(dist_cols).size().reset_index()
print(f"\n将生成 {len(dist_combinations)} 个图表...")
chart_count = 0
for idx, (_, dist_row) in enumerate(dist_combinations.iterrows()):
    # Slice the data down to this distinguish-combination.
    df_subset = df_renamed.copy()
    for dist_col in dist_cols:
        df_subset = df_subset[df_subset[dist_col] == dist_row[dist_col]]
    # Build a filesystem-safe output name from the column/value pairs.
    filter_parts = []
    for dist_col in dist_cols:
        val = dist_row[dist_col]
        safe_col_name = dist_col.replace('/', '_').replace('\\', '_')[:10]
        filter_parts.append(f"{safe_col_name}_{val}")
    output_filename = '_'.join(filter_parts) + ".png"
    output_path = os.path.join(args.输出目录, output_filename)
    print(f"[{idx+1}/{len(dist_combinations)}] 生成图表: {output_filename}")
    success = generate_chart(df_subset, output_path, 'vLLM_Engine', outer_group, inner_group, metric_cols, args.合并分组)
    if success:
        chart_count += 1
print(f"\n完成!共生成 {chart_count} 个图表,保存到目录: {args.输出目录}")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment