stat.py 3.91 KB
Newer Older
yuguo960516's avatar
yuguo960516 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import sys
import signal
import subprocess
import threading


class CudaUtilMemStat:
    def __init__(
        self, stat_file_path, stat_util=True, stat_mem=True, only_ordinal=None
    ):
        self.stat_file = open(stat_file_path, "wt")
        self.stat_util = stat_util
        self.stat_mem = stat_mem
        self.only_ordinal = only_ordinal
        self._write_titles()

    def __del__(self):
        self.stat_file.close()

    def _write_titles(self):
        proc = subprocess.Popen(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
        stdout, stderr = proc.communicate()
        if stderr != b"":
            raise RuntimeError(stderr)

        gpus = []
        lines = stdout.decode("utf-8").split("\n")
        for line in lines:
            if line.strip() == "":
                continue
            gpus.append(line.split(":")[0])

        util_titles, mem_titles = [], []
        for gpu in gpus:
            if self.stat_util:
                util_titles.append(gpu + " utilization")
            if self.stat_mem:
                mem_titles.append(gpu + " memory used")

        if self.only_ordinal is None:
            self.stat_file.write(",".join(util_titles + mem_titles) + "\n")
        else:
            titles = (
                util_titles[self.only_ordinal] + "," + mem_titles[self.only_ordinal]
            )
            self.stat_file.write(titles + "\n")
            self.stat_file.flush()

    def stat(self):
        # command: nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv
        proc = subprocess.Popen(
            ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used", "--format=csv"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = proc.communicate()
        if stderr != b"":
            raise RuntimeError(stderr)

        lines = stdout.decode("utf-8").split("\n")
        util_vals = []
        mem_vals = []
        for line in lines[1:]:
            if line.strip() == "":
                continue
            util_mem = line.split(",")
            assert len(util_mem) == 2, lines
            util = util_mem[0].strip()
            mem = util_mem[1].strip()
            assert "%" in util
            assert "MiB" in mem
            util = util.split(" ")
            mem = mem.split(" ")
            assert len(util) == 2
            assert len(mem) == 2
            util_val = util[0].strip()
            mem_val = mem[0].strip()
            if self.stat_util:
                util_vals.append(util_val)
            if self.stat_mem:
                mem_vals.append(mem_val)

        if self.only_ordinal is None:
            self.stat_file.write(",".join(util_vals + mem_vals) + "\n")
        else:
            vals = util_vals[self.only_ordinal] + "," + mem_vals[self.only_ordinal]
            self.stat_file.write(vals + "\n")
            self.stat_file.flush()

    def start(self, interval):
        stop = threading.Event()
        stat_thrd = StatThread(self.stat, interval, stop)
        stat_thrd.start()

        def close(signum, frame):
            print("Closing...")
            stop.set()

        signal.signal(signal.SIGTERM, close)
        signal.signal(signal.SIGINT, close)
        print("Start stat")
        print("Print Ctrl+C to stop")
        stop.wait()


class StatThread(threading.Thread):
    def __init__(self, handler, interval, stop_event):
        super().__init__()
        self.handler = handler
        self.interval = interval
        self.stopped = stop_event
        self.count = 0

    def run(self):
        while not self.stopped.wait(self.interval):
            print(f"{self.count} th run stat")
            self.handler()
            self.count += 1


if __name__ == "__main__":
    stat_file_path = sys.argv[1] if len(sys.argv) > 1 else "gpu_stat.log"
    interval = sys.argv[2] if len(sys.argv) > 2 else 1
    stat = CudaUtilMemStat(stat_file_path)
    stat.start(interval)