# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import subprocess
import time
import traceback
from xml.dom import minidom


def collect_gpu_usage(node_id):
    """Run ``rocm-smi -a --json`` and return the parsed GPU metrics.

    Args:
        node_id: Accepted for interface compatibility; not used here.

    Returns:
        The dict built by ``parse_nvidia_smi_result`` when the command
        succeeds, or the dict from ``gen_empty_gpu_metric`` when running
        the command fails (the traceback is printed, never raised).
    """
    cmd = ['rocm-smi', '-a', '--json']
    try:
        smi_output = subprocess.check_output(cmd)
    except Exception:
        # Best effort: a missing binary or a non-zero exit status must not
        # take down the caller, so report "no GPUs" instead.
        traceback.print_exc()
        return gen_empty_gpu_metric()
    return parse_nvidia_smi_result(smi_output)


def parse_nvidia_smi_result(smi):
    """Convert the JSON emitted by ``rocm-smi -a --json`` into a metrics dict.

    The name is historical; the payload parsed here is rocm-smi JSON, not
    nvidia-smi XML.

    Args:
        smi: JSON document (``str`` or ``bytes``) whose top-level object
            maps an entry name to a dict of fields. The ``'system'`` entry
            is not a GPU and is skipped.

    Returns:
        A dict with ``"Timestamp"``, ``"gpuCount"`` and ``"gpuInfos"`` (one
        dict per GPU holding ``index``, ``gpuUtil``, ``gpuMemUtil``,
        ``gpuType`` and ``gpuMemUsed``). Returns ``{}`` when the payload
        cannot be parsed or an expected field is missing; the traceback is
        printed rather than raised.
    """
    try:
        payload = json.loads(smi)
        # Only real GPU entries count; 'system' carries no per-GPU fields.
        gpu_entries = [
            fields for name, fields in payload.items() if name != 'system'
        ]
        output = {
            "Timestamp": time.asctime(time.localtime()),
            "gpuCount": len(gpu_entries),
            "gpuInfos": [],
        }
        for gpu_index, fields in enumerate(gpu_entries):
            # NOTE(review): the source fields below are kept exactly as
            # before. "OverDrive" values feeding the *Util metrics and
            # "GPU use (%)" feeding gpuMemUsed look mismatched -- confirm
            # against the rocm-smi version in use before relying on them.
            output["gpuInfos"].append({
                'index': gpu_index,
                'gpuUtil': fields["GPU OverDrive value (%)"],
                'gpuMemUtil': fields["GPU Memory OverDrive value (%)"],
                'gpuType': fields["GPU ID"],
                'gpuMemUsed': fields["GPU use (%)"],
            })
    except Exception:
        # Best effort: malformed output yields an empty result, not a crash.
        traceback.print_exc()
        output = {}
    return output


def gen_empty_gpu_metric():
    """Return a metrics dict that reports zero GPUs at the current time."""
    return {
        "Timestamp": time.asctime(time.localtime()),
        "gpuCount": 0,
        "gpuInfos": [],
    }