Commit 3e124af1 authored by qianyj
Browse files

[DCU] Fit for DTK

parent c31d2574
...@@ -14,13 +14,13 @@ from xml.dom import minidom ...@@ -14,13 +14,13 @@ from xml.dom import minidom
def main(argv): def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR'] metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
cmd = 'nvidia-smi -q -x'.split() cmd = 'rocm-smi --showuse --showmemuse --json'.split()
retry = 0 retry = 0
while True: while True:
smi = subprocess.run(cmd, timeout=20, stdout=subprocess.PIPE, stderr=subprocess.PIPE) smi = subprocess.run(cmd, timeout=20, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if smi.returncode != 0: if smi.returncode != 0:
retry += 1 retry += 1
print(f'gpu_metrics_collector error: nvidia-smi return code is {smi.returncode}', file=sys.stderr) print(f'gpu_metrics_collector error: rocm-smi return code is {smi.returncode}', file=sys.stderr)
print('=' * 20 + f'\nCaptured stdout: {smi.stdout}', file=sys.stderr) print('=' * 20 + f'\nCaptured stdout: {smi.stdout}', file=sys.stderr)
print('=' * 20 + f'\nCaptured stderr: {smi.stderr}', file=sys.stderr) print('=' * 20 + f'\nCaptured stderr: {smi.stderr}', file=sys.stderr)
gen_empty_gpu_metric(metrics_output_dir) gen_empty_gpu_metric(metrics_output_dir)
...@@ -35,8 +35,7 @@ def main(argv): ...@@ -35,8 +35,7 @@ def main(argv):
def parse_nvidia_smi_result(smi, outputDir): def parse_nvidia_smi_result(smi, outputDir):
old_umask = os.umask(0) old_umask = os.umask(0)
try: try:
xmldoc = minidom.parseString(smi) gpuList = eval(smi)
gpuList = xmldoc.getElementsByTagName('gpu')
with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile: with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
outPut = {} outPut = {}
outPut["Timestamp"] = time.asctime(time.localtime()) outPut["Timestamp"] = time.asctime(time.localtime())
...@@ -45,14 +44,10 @@ def parse_nvidia_smi_result(smi, outputDir): ...@@ -45,14 +44,10 @@ def parse_nvidia_smi_result(smi, outputDir):
for gpuIndex, gpu in enumerate(gpuList): for gpuIndex, gpu in enumerate(gpuList):
gpuInfo = {} gpuInfo = {}
gpuInfo['index'] = gpuIndex gpuInfo['index'] = gpuIndex
gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\ gpuInfo['gpuUtil'] = gpuList[gpu][list(gpuList[gpu].keys())[0]] + "%"
.getElementsByTagName('gpu_util')[0]\ gpuInfo['gpuMemUtil'] = gpuList[gpu][list(gpuList[gpu].keys())[1]] + "%"
.childNodes[0].data.replace("%", "").strip() # can not find the runingProNumber. just put 1 to here
gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\ runningProNumber = 1
.getElementsByTagName('memory_util')[0]\
.childNodes[0].data.replace("%", "").strip()
processes = gpu.getElementsByTagName('processes')
runningProNumber = len(processes[0].getElementsByTagName('process_info'))
gpuInfo['activeProcessNum'] = runningProNumber gpuInfo['activeProcessNum'] = runningProNumber
outPut["gpuInfos"].append(gpuInfo) outPut["gpuInfos"].append(gpuInfo)
......
...@@ -8,7 +8,7 @@ from xml.dom import minidom ...@@ -8,7 +8,7 @@ from xml.dom import minidom
def collect_gpu_usage(node_id): def collect_gpu_usage(node_id):
cmd = 'nvidia-smi -q -x'.split() cmd = 'rocm-smi --showuse --showmemuse --showmeminfo vis_vram --showid --json'.split()
info = None info = None
try: try:
smi_output = subprocess.check_output(cmd) smi_output = subprocess.check_output(cmd)
...@@ -22,33 +22,25 @@ def collect_gpu_usage(node_id): ...@@ -22,33 +22,25 @@ def collect_gpu_usage(node_id):
def parse_nvidia_smi_result(smi): def parse_nvidia_smi_result(smi):
try: try:
output = {} output = {}
xmldoc = minidom.parseString(smi) gpuList = eval(smi)
gpuList = xmldoc.getElementsByTagName('gpu')
output["Timestamp"] = time.asctime(time.localtime()) output["Timestamp"] = time.asctime(time.localtime())
output["gpuCount"] = len(gpuList) output["gpuCount"] = len(gpuList)
output["gpuInfos"] = [] output["gpuInfos"] = []
for gpuIndex, gpu in enumerate(gpuList): for gpuIndex, gpu in enumerate(gpuList):
gpuInfo = {} gpuInfo = {}
gpuInfo['index'] = gpuIndex gpuInfo['index'] = gpuIndex
gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\ gpuInfo['gpuUtil'] = gpuList[gpu][list(gpuList[gpu].keys())[1]] + "%"
.getElementsByTagName('gpu_util')[0]\ gpuInfo['gpuMemUtil'] = gpuList[gpu][list(gpuList[gpu].keys())[2]] + "%"
.childNodes[0].data.replace("%", "").strip() runningProNumber = 1
gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
.getElementsByTagName('memory_util')[0]\
.childNodes[0].data.replace("%", "").strip()
processes = gpu.getElementsByTagName('processes')
runningProNumber = len(processes[0].getElementsByTagName('process_info'))
gpuInfo['activeProcessNum'] = runningProNumber gpuInfo['activeProcessNum'] = runningProNumber
gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name')[0]\ gpuInfo['gpuType'] = gpuList[gpu][list(gpuList[gpu].keys())[0]]
.childNodes[0].data gpuInfo['gpuMemTotal'] = round(float(gpuList[gpu][list(gpuList[gpu].keys())[3]])/1048576, 2)
memUsage = gpu.getElementsByTagName('fb_memory_usage')[0] gpuInfo['gpuMemUsed'] = round(float(gpuList[gpu][list(gpuList[gpu].keys())[4]])/1048576, 2)
gpuInfo['gpuMemTotal'] = memUsage.getElementsByTagName('total')[0]\ gpuInfo['gpuMemFree'] = str(gpuInfo['gpuMemTotal'] - gpuInfo['gpuMemUsed'])
.childNodes[0].data.replace("MiB", "").strip() gpuInfo['gpuMemTotal'] = str(gpuInfo['gpuMemTotal']) + "MB"
gpuInfo['gpuMemUsed'] = memUsage.getElementsByTagName('used')[0]\ gpuInfo['gpuMemUsed'] = str(gpuInfo['gpuMemUsed']) + "MB"
.childNodes[0].data.replace("MiB", "").strip() gpuInfo['gpuMemFree'] = str(gpuInfo['gpuMemFree']) + "MB"
gpuInfo['gpuMemFree'] = memUsage.getElementsByTagName('free')[0]\
.childNodes[0].data.replace("MiB", "").strip()
output["gpuInfos"].append(gpuInfo) output["gpuInfos"].append(gpuInfo)
except Exception: except Exception:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment