gpu_metrics_collector.py 3.74 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4
5
6
7
8

import json
import os
import subprocess
import sys
import time
9
import traceback
Deshui Yu's avatar
Deshui Yu committed
10
11
12

from xml.dom import minidom

13

Deshui Yu's avatar
Deshui Yu committed
14
def check_ready_to_run():
    """Return True when no other GPU metrics collector process is found.

    On Windows the process table is queried with ``wmic``; elsewhere
    ``pgrep`` is used to list the current user's processes whose command
    line matches ``python3 -m nni_gpu_tool.gpu_metrics_collector``.
    The calling process is always ignored, and on non-Windows platforms its
    parent process is ignored as well.

    Returns:
        bool: True if no other matching process remains after filtering,
        False otherwise.

    Raises:
        subprocess.CalledProcessError: if the query command fails for a
            reason other than ``pgrep`` reporting "no process matched".
    """
    own_pid = os.getpid()

    if sys.platform == 'win32':
        pgrep_output = subprocess.check_output(
            'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId')
        tokens = pgrep_output.decode("utf-8").strip().split()
        # Keeping only numeric tokens drops the 'ProcessId' column header and
        # also copes with empty output, where there is no header to pop.
        # Excluding own_pid by comparison cannot fail when it is not listed.
        other_pids = [int(token) for token in tokens
                      if token.isdigit() and int(token) != own_pid]
        return not other_pids

    try:
        pgrep_output = subprocess.check_output('pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
    except subprocess.CalledProcessError as error:
        # pgrep exits with status 1 when nothing matched, which simply means
        # no collector is running; any other status is a genuine failure.
        if error.returncode != 1:
            raise
        return True

    # With -a each output line is "<pid> <command line>".
    ignored_prefixes = ('%s ' % own_pid, '%s ' % os.getppid())
    other_processes = []
    for raw_line in pgrep_output.splitlines():
        line = raw_line.decode(errors="replace")
        # Lines containing "pgrep " are skipped as well (presumably the shell
        # spawned to run the query, whose command line contains the pattern).
        if "pgrep " in line or line.startswith(ignored_prefixes):
            continue
        other_processes.append(line)
    return not other_processes
Deshui Yu's avatar
Deshui Yu committed
32

33

Deshui Yu's avatar
Deshui Yu committed
34
def main(argv):
    """Collect GPU metrics with ``nvidia-smi`` in a five-second polling loop.

    The output directory is read from the ``METRIC_OUTPUT_DIR`` environment
    variable. If ``check_ready_to_run()`` reports that another collector is
    already running, a message is printed and the process exits with
    status 2. Otherwise ``nvidia-smi -q -x`` is run repeatedly and each
    result is handed to ``parse_nvidia_smi_result``. When the command
    fails, the traceback is printed, an empty metric record is written via
    ``gen_empty_gpu_metric`` and the loop ends.

    Args:
        argv: Command-line arguments; currently unused.

    Raises:
        KeyError: if ``METRIC_OUTPUT_DIR`` is not set.
        SystemExit: with code 2 when a collector is already running.
    """
    metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
    if not check_ready_to_run():
        print("GPU metrics collector is already running. exiting...")
        # sys.exit instead of the builtin exit(), which is only installed by
        # the site module and is meant for interactive sessions.
        sys.exit(2)
    cmd = ['nvidia-smi', '-q', '-x']
    while True:
        try:
            smi_output = subprocess.check_output(cmd)
        except Exception:
            traceback.print_exc()
            gen_empty_gpu_metric(metrics_output_dir)
            break
        parse_nvidia_smi_result(smi_output, metrics_output_dir)
        # TODO: change to sleep time configurable via arguments
        time.sleep(5)

51

Deshui Yu's avatar
Deshui Yu committed
52
53
def _utilization_value(utilization, tag):
    """Return the text of the first ``tag`` child with '%' and padding removed."""
    return utilization.getElementsByTagName(tag)[0]\
        .childNodes[0].data.replace("%", "").strip()


def parse_nvidia_smi_result(smi, outputDir):
    """Append one JSON line of GPU metrics parsed from nvidia-smi XML output.

    The record holds a ``Timestamp``, the ``gpuCount`` and, per ``<gpu>``
    element, an entry in ``gpuInfos`` with ``index``, ``gpuUtil``,
    ``gpuMemUtil`` (both kept as strings with the percent sign stripped) and
    ``activeProcessNum`` (the number of ``<process_info>`` elements). The
    record is printed and appended to ``<outputDir>/gpu_metrics``.

    Any error while parsing or writing is reported on stdout and swallowed;
    in that case no record is written for this call.

    Args:
        smi: XML document (str or bytes) as produced by ``nvidia-smi -q -x``.
        outputDir: Directory containing the ``gpu_metrics`` file.
    """
    # Clear the umask so the process umask does not restrict the mode the
    # metrics file is created with. Doing this before the try guarantees
    # old_umask is bound when the finally clause restores it.
    old_umask = os.umask(0)
    try:
        xmldoc = minidom.parseString(smi)
        gpuList = xmldoc.getElementsByTagName('gpu')
        outPut = {
            "Timestamp": time.asctime(time.localtime()),
            "gpuCount": len(gpuList),
            "gpuInfos": [],
        }
        # Build the complete record before touching the file so a malformed
        # GPU entry cannot leave a freshly created, empty metrics file.
        for gpuIndex, gpu in enumerate(gpuList):
            utilization = gpu.getElementsByTagName('utilization')[0]
            processes = gpu.getElementsByTagName('processes')
            outPut["gpuInfos"].append({
                'index': gpuIndex,
                'gpuUtil': _utilization_value(utilization, 'gpu_util'),
                'gpuMemUtil': _utilization_value(utilization, 'memory_util'),
                'activeProcessNum': len(processes[0].getElementsByTagName('process_info')),
            })
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            print(outPut)
            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
            outputFile.flush()
    except Exception as error:
        print('gpu_metrics_collector error: %s' % error)
    finally:
        os.umask(old_umask)

85

86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
def gen_empty_gpu_metric(outputDir):
    """Append a metrics record reporting zero GPUs to ``<outputDir>/gpu_metrics``.

    The record carries the current local time as ``Timestamp``, a
    ``gpuCount`` of 0 and an empty ``gpuInfos`` list. It is printed and then
    written as a single JSON line. Failures are reported via a printed
    traceback rather than raised, and the previous umask is restored
    afterwards.
    """
    try:
        # Clear the umask so it does not restrict the file's creation mode.
        old_umask = os.umask(0)
        metrics_path = os.path.join(outputDir, "gpu_metrics")
        with open(metrics_path, 'a') as metrics_file:
            record = {
                "Timestamp": time.asctime(),
                "gpuCount": 0,
                "gpuInfos": [],
            }
            print(record)
            serialized = json.dumps(record, sort_keys=True)
            metrics_file.write(serialized + "\n")
            metrics_file.flush()
    except Exception:
        traceback.print_exc()
    finally:
        os.umask(old_umask)
Deshui Yu's avatar
Deshui Yu committed
101
102
103
104


# Script entry point: forward the command-line arguments (without the
# program name) to main() when this module is executed directly.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)