gpu_metrics_collector.py 2.74 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4
5
6
7
8

import json
import os
import subprocess
import sys
import time
9
import traceback
Deshui Yu's avatar
Deshui Yu committed
10
11
12

from xml.dom import minidom

13

Deshui Yu's avatar
Deshui Yu committed
14
def main(argv):
    """Poll ``nvidia-smi`` in a loop and record GPU metrics.

    The output directory is read from the ``METRIC_OUTPUT_DIR`` environment
    variable; a ``KeyError`` is raised if it is not set. Every five seconds
    ``nvidia-smi -q -x`` is executed and its output is handed to
    ``parse_nvidia_smi_result``. If the command cannot be run, the traceback
    is printed, an empty metric record is written with
    ``gen_empty_gpu_metric`` and the loop ends.

    Args:
        argv: Command-line arguments; currently unused.
    """
    output_dir = os.environ['METRIC_OUTPUT_DIR']
    query = ['nvidia-smi', '-q', '-x']

    while True:
        try:
            report = subprocess.check_output(query)
        except Exception:
            # The query failed: record "no GPUs" once and stop polling.
            traceback.print_exc()
            gen_empty_gpu_metric(output_dir)
            return
        parse_nvidia_smi_result(report, output_dir)
        # TODO: change to sleep time configurable via arguments
        time.sleep(5)

29

Deshui Yu's avatar
Deshui Yu committed
30
31
def parse_nvidia_smi_result(smi, outputDir):
    """Append one JSON metrics record parsed from an nvidia-smi XML report.

    The record holds a ``Timestamp``, the ``gpuCount`` and a ``gpuInfos``
    list with one entry per ``<gpu>`` element. It is printed to stdout and
    appended as a single JSON line to ``<outputDir>/gpu_metrics``.

    Any exception raised while parsing or writing is caught and reported on
    stdout; nothing is raised to the caller.

    Args:
        smi: XML document (as accepted by ``minidom.parseString``), e.g. the
            output of ``nvidia-smi -q -x``.
        outputDir: Directory containing (or to contain) the ``gpu_metrics``
            file.
    """
    # Clear the umask *before* entering the try block so that the finally
    # clause always has a valid value to restore.
    old_umask = os.umask(0)
    try:
        xmldoc = minidom.parseString(smi)
        gpu_nodes = xmldoc.getElementsByTagName('gpu')
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            record = {
                "Timestamp": time.asctime(time.localtime()),
                "gpuCount": len(gpu_nodes),
                "gpuInfos": [
                    _gpu_info_from_node(gpu_index, gpu)
                    for gpu_index, gpu in enumerate(gpu_nodes)
                ],
            }
            print(record)
            outputFile.write("{}\n".format(json.dumps(record, sort_keys=True)))
            outputFile.flush()
    except Exception as error:
        # Best effort: a bad sample must not stop the caller's polling loop.
        print('gpu_metrics_collector error: %s' % error)
    finally:
        os.umask(old_umask)


def _gpu_info_from_node(gpu_index, gpu):
    """Build the metrics dict for a single ``<gpu>`` element."""
    utilization = gpu.getElementsByTagName('utilization')[0]
    return {
        'index': gpu_index,
        'gpuUtil': _percent_text(utilization, 'gpu_util'),
        'gpuMemUtil': _percent_text(utilization, 'memory_util'),
        'activeProcessNum': len(
            gpu.getElementsByTagName('processes')[0]
            .getElementsByTagName('process_info')
        ),
    }


def _percent_text(parent, tag):
    """Return the text of the first ``tag`` child with '%' and spaces removed."""
    text = parent.getElementsByTagName(tag)[0].childNodes[0].data
    return text.replace("%", "").strip()

63

64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def gen_empty_gpu_metric(outputDir):
    """Append a metrics record that reports zero GPUs.

    The record (``Timestamp``, ``gpuCount`` of 0 and an empty ``gpuInfos``
    list) is printed to stdout and appended as a single JSON line to
    ``<outputDir>/gpu_metrics``.

    Any exception raised while writing is caught and its traceback printed;
    nothing is raised to the caller.

    Args:
        outputDir: Directory containing (or to contain) the ``gpu_metrics``
            file.
    """
    # Clear the umask *before* entering the try block so that the finally
    # clause always has a valid value to restore.
    old_umask = os.umask(0)
    try:
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            record = {
                "Timestamp": time.asctime(time.localtime()),
                "gpuCount": 0,
                "gpuInfos": [],
            }
            print(record)
            outputFile.write("{}\n".format(json.dumps(record, sort_keys=True)))
            outputFile.flush()
    except Exception:
        traceback.print_exc()
    finally:
        os.umask(old_umask)
Deshui Yu's avatar
Deshui Yu committed
79
80
81
82


# Script entry point: pass the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])