check_soruce.py 5.17 KB
Newer Older
lishj6's avatar
init  
lishj6 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import subprocess
import json
import time
import sys

def run_hy_smi():
    """Invoke ``hy-smi`` and return its JSON output decoded into Python objects.

    The command is asked for temperature, power, utilisation and memory-use
    readings in JSON form, which is more reliable to parse than the tabular
    text output. Every failure path prints a diagnostic and yields ``None``
    rather than raising.

    Returns:
        The value decoded from the command's stdout, or ``None`` when the
        command exits with a non-zero status, exceeds the 10-second timeout,
        emits text that is not valid JSON, or any other exception occurs.
    """
    command = ['hy-smi', '--showtemp', '--showpower', '--showuse', '--showmemuse', '--json']
    try:
        completed = subprocess.run(command, capture_output=True, text=True, timeout=10)
        if completed.returncode == 0:
            return json.loads(completed.stdout)
        print(f"命令执行错误: {completed.stderr}")
    except subprocess.TimeoutExpired:
        print("命令执行超时")
    except json.JSONDecodeError as err:
        print(f"JSON解析错误: {err}")
    except Exception as err:
        # Deliberately broad: e.g. the executable being absent must not
        # terminate the caller.
        print(f"意外错误: {err}")
    return None

def parse_data(data):
    """Extract per-card metrics from decoded ``hy-smi`` JSON output.

    Args:
        data: Mapping of card identifier to a mapping of metric label to
            reading. Readings are converted with ``float()``, so numeric
            strings and numbers are both accepted.

    Returns:
        A list with one dict per successfully parsed card, holding the keys
        ``card``, ``temperature``, ``power``, ``utilization`` and
        ``mem_usage_percent`` (readings as ``float``). A label missing from a
        card's mapping is recorded as ``0.0``. Entries that are not mappings,
        or that contain a non-numeric reading, are reported and skipped. If
        ``data`` itself is not a mapping, the problem is reported and an
        empty list is returned.
    """
    # (output field, label used in the hy-smi JSON). The junction sensor is
    # used as the temperature reading.
    fields = (
        ('temperature', "Temperature (Sensor junction) (C)"),
        ('power', "Average Graphics Package Power (W)"),
        ('utilization', "HCU use (%)"),
        ('mem_usage_percent', "HCU memory use (%)"),
    )
    metrics = []

    try:
        entries = data.items()
    except AttributeError as e:
        # Top-level JSON value was not an object (e.g. a list or a string).
        print(f"解析数据时出错: {e}")
        return metrics

    for card_key, card_data in entries:
        try:
            metric = {'card': card_key}
            for field, label in fields:
                metric[field] = float(card_data.get(label, 0))
        except (ValueError, TypeError, AttributeError) as e:
            # AttributeError covers entries whose value is not a mapping
            # (no ``.get``); one bad entry must not abort the whole sample.
            print(f"解析数据时出错: {e}")
            continue
        metrics.append(metric)
    return metrics

def main():
    """Poll ``hy-smi`` once per second, print each sample, and summarise on Ctrl+C.

    Each successful poll prints one row per card and appends the samples to
    an in-memory list. Polls that return no data or no parsable cards are
    silently skipped. On ``KeyboardInterrupt`` the overall and per-card
    averages of everything collected are printed.
    """
    print("开始收集DCU指标,按Ctrl+C停止...")
    print("{:<10} {:<8} {:<10} {:<10} {:<15} {:<15}".format(
        "时间", "卡号", "温度(℃)", "功耗(W)", "利用率(%)", "内存使用(%)"))

    # NOTE: every sample is retained until the summary is printed, so memory
    # use grows with run time.
    data_points = []
    try:
        while True:
            raw_data = run_hy_smi()
            if raw_data:
                current_metrics = parse_data(raw_data)
                for metric in current_metrics:
                    print("{:<10} {:<8} {:<10.1f} {:<10.1f} {:<15.1f} {:<15.0f}".format(
                        time.strftime("%H:%M:%S"),
                        metric['card'],
                        metric['temperature'],
                        metric['power'],
                        metric['utilization'],
                        metric['mem_usage_percent']
                    ))
                data_points.extend(current_metrics)

            time.sleep(1)

    except KeyboardInterrupt:
        print("\n正在计算平均值...")
        _print_summary(data_points)


def _field_means(samples):
    """Return the mean temperature, power, utilisation and memory use of *samples*."""
    count = len(samples)
    return tuple(
        sum(sample[field] for sample in samples) / count
        for field in ('temperature', 'power', 'utilization', 'mem_usage_percent')
    )


def _print_summary(data_points):
    """Print overall averages followed by a per-card table of averages."""
    if not data_points:
        print("未收集到有效数据")
        return

    avg_temp, avg_power, avg_util, avg_mem_percent = _field_means(data_points)
    print("\n所有卡的平均值统计:")
    print("平均温度: {:.1f}℃".format(avg_temp))
    print("平均功耗: {:.1f}W".format(avg_power))
    print("平均利用率: {:.1f}%".format(avg_util))
    print("平均内存使用: {:.1f}%".format(avg_mem_percent))

    # Group samples by card, preserving first-seen order.
    samples_by_card = {}
    for sample in data_points:
        samples_by_card.setdefault(sample['card'], []).append(sample)

    print("\n每张卡的平均值统计:")
    # Column header for the per-card rows. The original repeated the overall
    # averages under this heading instead of labelling the table.
    print("{:<8} {:<10} {:<10} {:<15} {:<15}".format(
        "卡号", "温度(℃)", "功耗(W)", "利用率(%)", "内存使用(%)"))
    for card, samples in samples_by_card.items():
        print("{:<8} {:<10.1f} {:<10.1f} {:<15.1f} {:<15.1f}".format(
            card, *_field_means(samples)))

# Start collecting only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()