# gpu_metrics_collector.py
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import os
import subprocess
import sys
import time
import traceback

from xml.dom import minidom
import json

def main(argv):
    """Poll ``rocm-smi`` every 5 seconds and record GPU metrics.

    The output directory is read from the ``METRIC_OUTPUT_DIR`` environment
    variable; a missing variable raises ``KeyError`` before the loop starts.

    Each iteration runs ``rocm-smi -a --json`` and hands its output to
    ``parse_nvidia_smi_result``.  If running the command raises, the
    traceback is printed, an empty metrics record is written via
    ``gen_empty_gpu_metric`` and the loop ends.

    Args:
        argv: Command-line arguments (currently unused).
    """
    metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']

    # Explicit argument list: no shell is involved and no string splitting
    # is needed at runtime.
    cmd = ['rocm-smi', '-a', '--json']
    while True:
        try:
            smi_output = subprocess.check_output(cmd)
        except Exception:
            # The tool is missing or failed: record "no GPUs" once and stop
            # polling instead of retrying forever.
            traceback.print_exc()
            gen_empty_gpu_metric(metrics_output_dir)
            break
        parse_nvidia_smi_result(smi_output, metrics_output_dir)
        # TODO: change to sleep time configurable via arguments
        time.sleep(5)


def parse_nvidia_smi_result(smi, outputDir):
    """Append one metrics record built from ``rocm-smi -a --json`` output.

    Despite its name, this function parses JSON (not nvidia-smi XML).  The
    decoded document is expected to be an object whose keys name devices;
    a key named ``'system'`` is not treated as a GPU and is skipped.

    One line of JSON is appended to ``<outputDir>/gpu_metrics`` containing
    ``Timestamp``, ``gpuCount`` and ``gpuInfos`` (a list of dicts with
    ``index``, ``gpuUtil`` and ``gpuMemUtil``).  The same record is also
    printed to stdout.

    Args:
        smi: JSON text (``str`` or ``bytes``) produced by the SMI command.
        outputDir: Directory that holds the ``gpu_metrics`` file.

    Any exception raised while parsing or writing is caught and reported on
    stdout; nothing is re-raised to the caller.
    """
    # Taken outside the ``try`` so the ``finally`` clause always has a mask
    # to restore.  A zero umask lets the file be created with the full
    # permission bits requested by ``open``.
    old_umask = os.umask(0)
    try:
        smi = json.loads(smi)
        # Filter 'system' out *before* counting and indexing so GPU indices
        # are contiguous from 0 and the count is right whether or not (and
        # wherever) the 'system' entry appears.
        gpuList = [name for name in smi if name != 'system']
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            outPut = {}
            outPut["Timestamp"] = time.asctime(time.localtime())
            outPut["gpuCount"] = len(gpuList)
            outPut["gpuInfos"] = []
            for gpuIndex, gpu in enumerate(gpuList):
                gpuInfo = {}
                gpuInfo['index'] = gpuIndex
                # NOTE(review): these keys read "OverDrive" values, which
                # look like overclock settings rather than utilisation
                # (rocm-smi also reports e.g. "GPU use (%)") -- confirm
                # against the SMI version deployed before changing.
                gpuInfo['gpuUtil'] = smi[gpu]["GPU OverDrive value (%)"]
                gpuInfo['gpuMemUtil'] = smi[gpu]["GPU Memory OverDrive value (%)"]

                outPut["gpuInfos"].append(gpuInfo)
            print(outPut)
            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
            outputFile.flush()
    except Exception as error:
        # Best-effort collector: report the problem and let the caller's
        # polling loop continue.
        print('gpu_metrics_collector error: %s' % error)
    finally:
        os.umask(old_umask)


def gen_empty_gpu_metric(outputDir):
    """Append a "no GPUs" record to ``<outputDir>/gpu_metrics``.

    The record is one line of JSON with the current local timestamp, a
    ``gpuCount`` of 0 and an empty ``gpuInfos`` list; it is also printed to
    stdout.  Exceptions are caught and their traceback printed rather than
    propagated.

    Args:
        outputDir: Directory that holds the ``gpu_metrics`` file.
    """
    try:
        # Zero umask while the file is (possibly) created; restored below.
        previous_mask = os.umask(0)
        metrics_path = os.path.join(outputDir, "gpu_metrics")
        with open(metrics_path, 'a') as sink:
            record = {
                "Timestamp": time.asctime(time.localtime()),
                "gpuCount": 0,
                "gpuInfos": [],
            }
            print(record)
            serialized = json.dumps(record, sort_keys=True)
            sink.write("{}\n".format(serialized))
            sink.flush()
    except Exception:
        traceback.print_exc()
    finally:
        os.umask(previous_mask)


# Script entry point: forward the command-line arguments (without the
# program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])