#!/usr/bin/env python3
# -*- coding: utf-8 -*-
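"""Parse perf benchmark logs and append the extracted metrics to a CSV file.

Usage:
    python3 parse_logs.py [log_glob] [output_csv]

    log_glob    glob pattern selecting the log files to parse (default: "*.log")
    output_csv  path of the CSV file to append to (default: "result.csv")

Each successfully parsed log contributes one row with the columns
model_name, batch_size, input_shape, FPS, MaxMemoryUsageMiB and HCU%.
Rows that duplicate an existing record (same model, batch size and input
shape) are skipped.
"""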

import re
import os
import glob
import csv
import sys
from pathlib import Path
from collections import defaultdict


def parse_log_file(log_file_path):
    """
    解析单个日志文件,提取关键信息
    
    参数:
        log_file_path: 日志文件路径
        
    返回:
        字典包含解析结果,或None表示解析失败
    """
    try:
        with open(log_file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        
        result = {
            'model_name': None,
            'batch_size': None,
            'input_shape': None,
            'FPS': None
        }
        
        # 1. Extract the model name from the command recorded in the log,
        #    looking for a pattern like "perf resnet50.onnx"
        model_match = re.search(r'[/\s]([a-zA-Z0-9_-]+)\.onnx', content)
        if model_match:
            result['model_name'] = model_match.group(1)
        else:
            # Not found in the log body; fall back to the file name,
            # matching names like "resnet50-4batch.log"
            filename = os.path.basename(log_file_path)
            filename_match = re.search(r'([a-zA-Z0-9_-]+)-\d+batch\.log', filename)
            if filename_match:
                result['model_name'] = filename_match.group(1)
        
        # 2. Extract the batch size from a line like "Batch size: 1"
        batch_match = re.search(r'Batch size:\s*(\d+)', content)
        if batch_match:
            result['batch_size'] = batch_match.group(1)
        
        # 3. Extract the input shape from a line like "--input-dim @input 1 3 224 224"
        input_match = re.search(r'--input-dim\s+(?:@)?[a-zA-Z0-9_\-\.]+\s+(\d+\s+\d+\s+\d+\s+\d+)', content)
        if input_match:
            result['input_shape'] = input_match.group(1)
        
        # 4. Extract the throughput. Prefer the rate reported on a line like
        #    "Rate: 1234.56 inferences/sec".
        rate_match = re.search(r'Rate:\s*([\d.]+)\s*inferences/sec', content)
        if rate_match:
            result['FPS'] = rate_match.group(1)
        else:
            # Fallback: use the value from "Total time: 2.08637ms". Note that
            # this branch stores a time in milliseconds under the FPS key, so
            # such rows are not directly comparable to real rate values.
            rate_match = re.search(r'Total time:\s*([\d.]+)ms', content)
            if rate_match:
                result['FPS'] = rate_match.group(1)
        # Extract the peak memory usage from a line like "最大使用: 2048 MiB"
        memory_match = re.search(r'最大使用:\s*([\d.]+)\s*MiB', content)
        if memory_match:
            result['MaxMemoryUsageMiB'] = memory_match.group(1)
        else:
            result['MaxMemoryUsageMiB'] = "null"

        # Extract the peak HCU utilization from a line like "峰值使用率: 87.5%"
        peak_usage_match = re.search(r'峰值使用率:\s*([\d.]+)%', content)
        if peak_usage_match:
            result['HCU%'] = peak_usage_match.group(1)
        else:
            # Default to "null" so the key always exists and the warning
            # printout below cannot raise a KeyError
            result['HCU%'] = "null"
        
        # Check whether all required fields were extracted
        if all(result.values()):
            return result
        else:
            print(f"Warning: file {log_file_path} was only partially parsed:")
            print(f"  model_name: {result['model_name']}")
            print(f"  batch_size: {result['batch_size']}")
            print(f"  input_shape: {result['input_shape']}")
            print(f"  FPS: {result['FPS']}")
            print(f"  MaxMemoryUsageMiB: {result['MaxMemoryUsageMiB']}")
            print(f"  HCU%: {result['HCU%']}")
            return None
            
    except Exception as e:
        print(f"错误: 读取或解析文件 {log_file_path} 时出错: {e}")
        return None
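
# A minimal sketch of the log layout the regexes above assume (hypothetical
# sample; real perf output may differ):
#
#   perf resnet50.onnx --input-dim @input 1 3 224 224
#   Batch size: 4
#   Rate: 1234.56 inferences/sec
#   最大使用: 2048 MiB
#   峰值使用率: 87.5%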

def main():
    """Entry point: parse matching logs and append the results to a CSV."""
    # Parse command-line arguments
    if len(sys.argv) > 1:
        log_pattern = sys.argv[1]
    else:
        log_pattern = "*.log"  # default: match every .log file

    if len(sys.argv) > 2:
        csv_file = sys.argv[2]
    else:
        csv_file = "result.csv"  # default output file name

    print(f"Parsing log files matching pattern: {log_pattern}")
    print(f"Writing output to: {csv_file}")
    print("=" * 50)
    
    # Collect all matching log files
    log_files = glob.glob(log_pattern)

    if not log_files:
        print(f"Error: no log files match '{log_pattern}'")
        return

    print(f"Found {len(log_files)} log file(s)")
    
    # Write a header only if the CSV file does not yet exist or is empty
    write_header = not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0
    
    # Parse every log file
    results = []
    for log_file in sorted(log_files):
        print(f"Parsing: {log_file}")
        result = parse_log_file(log_file)
        if result:
            results.append(result)
            print(f"  ✓ Parsed: {result['model_name']} (batch={result['batch_size']})")
    
    if not results:
        print("Error: no log file could be parsed")
        return
    
    # Append the results to the CSV file
    try:
        with open(csv_file, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['model_name', 'batch_size', 'input_shape', 'FPS', 'MaxMemoryUsageMiB', 'HCU%'])

            if write_header:
                writer.writeheader()
                print(f"Created new CSV file: {csv_file}")

            # Load any existing rows so duplicate records can be skipped
            existing_data = []
            if os.path.exists(csv_file) and os.path.getsize(csv_file) > 0:
                with open(csv_file, 'r', encoding='utf-8') as existing_f:
                    existing_reader = csv.DictReader(existing_f)
                    existing_data = list(existing_reader)

            # Sort the results by model_name (string, ascending), then by
            # batch_size (numeric, ascending)
            results.sort(key=lambda x: (x['model_name'], int(x['batch_size'])))
            
            new_count = 0
            for result in results:
                # Skip records that already exist with the same model,
                # batch size and input shape
                is_duplicate = False
                for existing in existing_data:
                    if (existing.get('model_name') == result['model_name'] and
                        existing.get('batch_size') == result['batch_size'] and
                        existing.get('input_shape') == result['input_shape']):
                        is_duplicate = True
                        print(f"  ⚠ Skipping duplicate record: {result['model_name']}, batch={result['batch_size']}")
                        break

                if not is_duplicate:
                    writer.writerow({
                        'model_name': result['model_name'],
                        'batch_size': result['batch_size'],
                        'input_shape': result['input_shape'],
                        'FPS': result['FPS'],
                        'MaxMemoryUsageMiB': result['MaxMemoryUsageMiB'],
                        'HCU%': result['HCU%']
                    })
                    new_count += 1

        print(f"\n" + "=" * 50)
        print("解析完成!")
        print(f"处理日志文件数: {len(log_files)}")
        print(f"成功解析文件数: {len(results)}")
        print(f"新增记录数: {new_count}")
        print(f"CSV文件: {csv_file}")
        
        # 显示CSV文件内容预览
        print("\nCSV内容预览:")
        print("-" * 50)
        try:
            with open(csv_file, 'r', encoding='utf-8') as f:
                lines = f.readlines()
                for i, line in enumerate(lines[:20]):  # 显示前20行
                    print(line.rstrip())
                if len(lines) > 20:
                    print(f"... (共 {len(lines)} 行)")
        except Exception as e:
            print(f"读取CSV文件时出错: {e}")
        
    except Exception as e:
        print(f"错误: 写入CSV文件时出错: {e}")

if __name__ == "__main__":
    main()
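
# A sketch of the CSV this script produces (values are hypothetical):
#
#   model_name,batch_size,input_shape,FPS,MaxMemoryUsageMiB,HCU%
#   resnet50,1,1 3 224 224,1234.56,2048,87.5
#   resnet50,4,4 3 224 224,3456.78,4096,95.0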