monitor.py 4.03 KB
Newer Older
wangkx1's avatar
init  
wangkx1 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# scripts/monitor.py
import subprocess
import threading
import time
from datetime import datetime
from typing import Optional

class MemoryMonitor:
    """Sample device memory (VRAM) usage in a background thread.

    While monitoring is active, ``hy-smi`` is queried for one device roughly
    once per second and a line of the form
    ``<date> <time> <used-memory> <utilisation>`` is appended to a plain-text
    log file. :meth:`get_statistics` later summarises that log.
    """

    # Upper bound (seconds) on one ``hy-smi`` call so that a hung tool cannot
    # block the sampling thread -- and therefore ``stop`` -- indefinitely.
    _QUERY_TIMEOUT = 10
    # Pause (seconds) between two consecutive samples.
    _SAMPLE_INTERVAL = 1

    def __init__(self, device_id: int, log_file: str = "memory_simple.log"):
        """Create a monitor for ``device_id`` that logs to ``log_file``.

        Nothing is started and no file is touched until :meth:`start` (or
        :meth:`monitor_memory`) is called.
        """
        self.device_id = device_id
        self.log_file = log_file
        self.monitoring = False
        self.monitor_thread: Optional[threading.Thread] = None
        self.total_memory: Optional[int] = None

    def _smi_output(self) -> str:
        """Run ``hy-smi`` for this device and return its standard output.

        Raises:
            OSError: the executable could not be launched.
            subprocess.TimeoutExpired: the call exceeded ``_QUERY_TIMEOUT``.
        """
        # An argument list (no shell) keeps ``device_id`` from ever being
        # interpreted as shell syntax; splitting it on whitespace preserves
        # the word-splitting the former shell command line applied to it.
        command = [
            "hy-smi",
            "-d",
            *str(self.device_id).split(),
            "--showmeminfo",
            "vram",
            "--showuse",
        ]
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=self._QUERY_TIMEOUT,
        )
        return completed.stdout

    @staticmethod
    def _field_value(line: str) -> Optional[str]:
        """Return the first token after the second ``:`` of ``line``.

        Returns ``None`` when the line has fewer than three colon-separated
        parts or when the third part is blank.
        """
        parts = line.split(':')
        if len(parts) < 3:
            return None
        tokens = parts[2].split()
        return tokens[0] if tokens else None

    @staticmethod
    def _to_float(token: str) -> Optional[float]:
        """Convert a logged token to ``float``; ``None`` if it is not numeric.

        A trailing ``%`` is tolerated so that utilisation values written with
        a percent sign are still counted.
        """
        try:
            return float(token.rstrip('%'))
        except ValueError:
            return None

    def get_total_memory(self) -> Optional[int]:
        """Return the total VRAM reported by ``hy-smi``, or ``None``.

        The value is taken from the first output line containing
        ``"vram Total Memory"``. ``None`` is returned when no such line is
        found or when the query or the integer conversion fails (the failure
        is printed, not raised).
        """
        try:
            for line in self._smi_output().splitlines():
                if "vram Total Memory" not in line:
                    continue
                value = self._field_value(line)
                if value is not None:
                    return int(value)
        except (OSError, subprocess.SubprocessError, ValueError) as e:
            print(f"获取总显存失败: {e}")

        return None

    def monitor_memory(self):
        """Sampling loop: log used memory and utilisation until stopped.

        Truncates the log file, then repeats while ``self.monitoring`` is
        true: query ``hy-smi``, and if both the used-memory and the
        utilisation fields were found, append one timestamped line. Errors
        in a single iteration are printed and the loop continues.
        """
        # Start every monitoring session with an empty log.
        with open(self.log_file, 'w', encoding="utf-8"):
            pass

        while self.monitoring:
            try:
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                used_memory = None
                used_percent = None

                for line in self._smi_output().splitlines():
                    if "vram Total Used Memory" in line:
                        value = self._field_value(line)
                        if value is not None:
                            used_memory = value
                    elif "HCU use" in line:
                        value = self._field_value(line)
                        if value is not None:
                            used_percent = value

                # Only complete samples are logged, so every log line has
                # the same four whitespace-separated columns.
                if used_memory and used_percent:
                    with open(self.log_file, 'a', encoding="utf-8") as f:
                        f.write(f"{timestamp} {used_memory} {used_percent}\n")

            except Exception as e:
                # Thread boundary: report and keep sampling rather than let
                # one bad iteration silently kill the monitor thread.
                print(f"监控出错: {e}")

            time.sleep(self._SAMPLE_INTERVAL)

    def start(self):
        """Start background monitoring and return the total VRAM.

        Returns:
            The total memory reported by :meth:`get_total_memory`, or
            ``None`` if it could not be determined. If a sampling thread is
            already running, no second thread is started and the previously
            determined total is returned.
        """
        already_running = (
            self.monitoring
            and self.monitor_thread is not None
            and self.monitor_thread.is_alive()
        )
        if already_running:
            # A second sampler would truncate and double-write the same log.
            return self.total_memory

        self.total_memory = self.get_total_memory()

        if self.total_memory is not None:
            print(f"总显存: {self.total_memory} MiB")

        print("开始监控显存使用...")

        self.monitoring = True
        self.monitor_thread = threading.Thread(target=self.monitor_memory)
        self.monitor_thread.start()

        return self.total_memory

    def stop(self):
        """Ask the sampling loop to finish and wait up to 2 s for it."""
        self.monitoring = False
        if self.monitor_thread:
            self.monitor_thread.join(timeout=2)

    def get_statistics(self) -> dict[str, Optional[float]]:
        """Summarise the log file written by :meth:`monitor_memory`.

        Returns:
            A dict with keys ``"total_memory"`` (the value cached by
            :meth:`start`, possibly ``None``), ``"max_used"`` and
            ``"max_percent"`` (the largest logged used-memory and utilisation
            values; ``0`` when none could be read).

        Lines whose fields are missing or non-numeric are skipped
        individually, so one malformed line does not discard the rest of the
        log. If the file cannot be read, the error is printed and the
        defaults are returned.
        """
        stats: dict[str, Optional[float]] = {
            "total_memory": self.total_memory,
            "max_used": 0,
            "max_percent": 0,
        }

        used_values: list[float] = []
        percent_values: list[float] = []

        try:
            with open(self.log_file, 'r', encoding="utf-8") as f:
                for line in f:
                    # Columns: date, time, used memory, utilisation (the
                    # timestamp itself contains one space).
                    tokens = line.split()
                    if len(tokens) >= 3:
                        used = self._to_float(tokens[2])
                        if used is not None:
                            used_values.append(used)
                    if len(tokens) >= 4:
                        percent = self._to_float(tokens[3])
                        if percent is not None:
                            percent_values.append(percent)

            if used_values:
                stats["max_used"] = max(used_values)
            if percent_values:
                stats["max_percent"] = max(percent_values)

        except (OSError, UnicodeDecodeError) as e:
            print(f"读取监控日志失败: {e}")

        return stats