gpu_metrics_collector.py 3.57 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4
5
6
7
8

import json
import os
import subprocess
import sys
import time
9
import traceback
Deshui Yu's avatar
Deshui Yu committed
10
11
12
13

from xml.dom import minidom

def check_ready_to_run():
    """Report whether this process is the only GPU metrics collector running.

    On Windows the process table is queried through ``wmic``; elsewhere
    ``pgrep`` is asked for processes of the current effective user whose full
    command line is exactly ``python3 -m nni_gpu_tool.gpu_metrics_collector``.
    The pid of the calling process is ignored in both cases.

    Returns:
        bool: True when no other matching process was found, False otherwise.

    Raises:
        subprocess.CalledProcessError: if the lookup tool fails for a reason
            other than pgrep reporting "no process matched".
        OSError: if the lookup tool cannot be started.
    """
    own_pid = os.getpid()
    if sys.platform == 'win32':
        wmic_output = subprocess.check_output(
            'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId')
        tokens = wmic_output.decode("utf-8").split()
    else:
        try:
            # An argument list avoids spawning a shell; the effective uid
            # replaces the former "$(whoami)" substitution.
            pgrep_output = subprocess.check_output(
                ['pgrep', '-fxu', str(os.geteuid()),
                 'python3 -m nni_gpu_tool.gpu_metrics_collector'])
        except subprocess.CalledProcessError as err:
            # pgrep exits with status 1 when nothing matched, which simply
            # means no collector is running; anything else is a real failure.
            if err.returncode != 1:
                raise
            pgrep_output = b''
        tokens = pgrep_output.decode("utf-8").split()

    # Keeping only numeric tokens drops wmic's 'ProcessId' column header and
    # tolerates empty output; filtering (rather than list.remove) tolerates
    # the current pid being absent from the listing.
    other_pids = [int(tok) for tok in tokens
                  if tok.isdigit() and int(tok) != own_pid]
    return not other_pids
Deshui Yu's avatar
Deshui Yu committed
29
30

def main(argv):
    """Poll ``nvidia-smi`` every five seconds and record the GPU metrics.

    The output directory is read from the ``METRIC_OUTPUT_DIR`` environment
    variable. If another collector is already running the process exits with
    status 2. When ``nvidia-smi`` cannot be executed, an empty metric record
    is written and the polling loop stops.

    Args:
        argv: command-line arguments; currently unused.

    Raises:
        KeyError: if ``METRIC_OUTPUT_DIR`` is not set.
        SystemExit: with code 2 when a collector is already running.
    """
    metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
    if not check_ready_to_run():
        # GPU metrics collector is already running. Exit
        # sys.exit is used because the bare exit() builtin is only injected
        # by the site module and is not guaranteed to exist.
        sys.exit(2)
    cmd = ['nvidia-smi', '-q', '-x']
    while True:
        try:
            smi_output = subprocess.check_output(cmd)
        except Exception:
            traceback.print_exc()
            gen_empty_gpu_metric(metrics_output_dir)
            break
        parse_nvidia_smi_result(smi_output, metrics_output_dir)
        # TODO: change to sleep time configurable via arguments
        time.sleep(5)

def _read_utilization(gpu, tag):
    """Return the text of ``<utilization>/<tag>`` for *gpu* without '%'."""
    utilization = gpu.getElementsByTagName('utilization')[0]
    raw = utilization.getElementsByTagName(tag)[0].childNodes[0].data
    return raw.replace("%", "").strip()


def parse_nvidia_smi_result(smi, outputDir):
    """Parse ``nvidia-smi -q -x`` XML and append one JSON record to a file.

    The record is appended as a single line (keys sorted) to the file
    ``gpu_metrics`` inside *outputDir* and is also printed to stdout. It
    holds a ``Timestamp``, the ``gpuCount`` and, per ``<gpu>`` element, its
    ``index``, ``gpuUtil``, ``gpuMemUtil`` (both as strings with the percent
    sign removed) and ``activeProcessNum`` (number of ``<process_info>``
    elements under the first ``<processes>`` element).

    The process umask is set to 0 while the file is opened and restored
    afterwards. Any error while parsing or writing is reported on
    stdout/stderr and swallowed; nothing is written in that case.

    Args:
        smi: XML document produced by nvidia-smi, as str or bytes.
        outputDir: directory that receives the ``gpu_metrics`` file.
    """
    # Taken before the try so the finally clause always has a value to
    # restore, even if the very first statement inside the try fails.
    old_umask = os.umask(0)
    try:
        xmldoc = minidom.parseString(smi)
        gpuList = xmldoc.getElementsByTagName('gpu')
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            gpuInfos = []
            for gpuIndex, gpu in enumerate(gpuList):
                processes = gpu.getElementsByTagName('processes')
                gpuInfos.append({
                    'index': gpuIndex,
                    'gpuUtil': _read_utilization(gpu, 'gpu_util'),
                    'gpuMemUtil': _read_utilization(gpu, 'memory_util'),
                    'activeProcessNum': len(
                        processes[0].getElementsByTagName('process_info')),
                })
            outPut = {
                "Timestamp": time.asctime(time.localtime()),
                "gpuCount": len(gpuList),
                "gpuInfos": gpuInfos,
            }
            print(outPut)
            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
            outputFile.flush()
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still stop the collector; keep the original message
        # and add the traceback so the actual cause is visible.
        print('xmldoc paring error')
        traceback.print_exc()
    finally:
        os.umask(old_umask)

def gen_empty_gpu_metric(outputDir):
    """Append a metric record that reports zero GPUs.

    One JSON line (keys sorted) with a ``Timestamp``, ``gpuCount`` of 0 and
    an empty ``gpuInfos`` list is appended to the file ``gpu_metrics`` inside
    *outputDir* and also printed to stdout. The process umask is set to 0
    while the file is opened and restored afterwards. Errors are printed as
    a traceback and swallowed.

    Args:
        outputDir: directory that receives the ``gpu_metrics`` file.
    """
    # Taken before the try so the finally clause always has a value to
    # restore.
    old_umask = os.umask(0)
    try:
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            outPut = {
                "Timestamp": time.asctime(time.localtime()),
                "gpuCount": 0,
                "gpuInfos": [],
            }
            print(outPut)
            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
            outputFile.flush()
    except Exception:
        traceback.print_exc()
    finally:
        os.umask(old_umask)
Deshui Yu's avatar
Deshui Yu committed
95
96
97
98


# Script entry point: start the collector only when this file is executed
# directly, forwarding the command-line arguments without the program name.
if __name__ == "__main__":
    main(sys.argv[1:])