"...resnet50_tensorflow.git" did not exist on "bef8f110aa1a53a56873445726cc2082e03efb11"
Commit 3e124af1 authored by qianyj's avatar qianyj
Browse files

[DCU] Fit for DTK

parent c31d2574
......@@ -14,13 +14,13 @@ from xml.dom import minidom
def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
cmd = 'nvidia-smi -q -x'.split()
cmd = 'rocm-smi --showuse --showmemuse --json'.split()
retry = 0
while True:
smi = subprocess.run(cmd, timeout=20, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if smi.returncode != 0:
retry += 1
print(f'gpu_metrics_collector error: nvidia-smi return code is {smi.returncode}', file=sys.stderr)
print(f'gpu_metrics_collector error: rocm-smi return code is {smi.returncode}', file=sys.stderr)
print('=' * 20 + f'\nCaptured stdout: {smi.stdout}', file=sys.stderr)
print('=' * 20 + f'\nCaptured stderr: {smi.stderr}', file=sys.stderr)
gen_empty_gpu_metric(metrics_output_dir)
......@@ -35,8 +35,7 @@ def main(argv):
def parse_nvidia_smi_result(smi, outputDir):
old_umask = os.umask(0)
try:
xmldoc = minidom.parseString(smi)
gpuList = xmldoc.getElementsByTagName('gpu')
gpuList = eval(smi)
with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
outPut = {}
outPut["Timestamp"] = time.asctime(time.localtime())
......@@ -45,14 +44,10 @@ def parse_nvidia_smi_result(smi, outputDir):
for gpuIndex, gpu in enumerate(gpuList):
gpuInfo = {}
gpuInfo['index'] = gpuIndex
gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
.getElementsByTagName('gpu_util')[0]\
.childNodes[0].data.replace("%", "").strip()
gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
.getElementsByTagName('memory_util')[0]\
.childNodes[0].data.replace("%", "").strip()
processes = gpu.getElementsByTagName('processes')
runningProNumber = len(processes[0].getElementsByTagName('process_info'))
gpuInfo['gpuUtil'] = gpuList[gpu][list(gpuList[gpu].keys())[0]] + "%"
gpuInfo['gpuMemUtil'] = gpuList[gpu][list(gpuList[gpu].keys())[1]] + "%"
# Cannot find the running process number (runningProNumber); just use 1 here.
runningProNumber = 1
gpuInfo['activeProcessNum'] = runningProNumber
outPut["gpuInfos"].append(gpuInfo)
......
......@@ -8,7 +8,7 @@ from xml.dom import minidom
def collect_gpu_usage(node_id):
cmd = 'nvidia-smi -q -x'.split()
cmd = 'rocm-smi --showuse --showmemuse --showmeminfo vis_vram --showid --json'.split()
info = None
try:
smi_output = subprocess.check_output(cmd)
......@@ -22,33 +22,25 @@ def collect_gpu_usage(node_id):
def parse_nvidia_smi_result(smi):
try:
output = {}
xmldoc = minidom.parseString(smi)
gpuList = xmldoc.getElementsByTagName('gpu')
gpuList = eval(smi)
output["Timestamp"] = time.asctime(time.localtime())
output["gpuCount"] = len(gpuList)
output["gpuInfos"] = []
for gpuIndex, gpu in enumerate(gpuList):
gpuInfo = {}
gpuInfo['index'] = gpuIndex
gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
.getElementsByTagName('gpu_util')[0]\
.childNodes[0].data.replace("%", "").strip()
gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
.getElementsByTagName('memory_util')[0]\
.childNodes[0].data.replace("%", "").strip()
processes = gpu.getElementsByTagName('processes')
runningProNumber = len(processes[0].getElementsByTagName('process_info'))
gpuInfo['gpuUtil'] = gpuList[gpu][list(gpuList[gpu].keys())[1]] + "%"
gpuInfo['gpuMemUtil'] = gpuList[gpu][list(gpuList[gpu].keys())[2]] + "%"
runningProNumber = 1
gpuInfo['activeProcessNum'] = runningProNumber
gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name')[0]\
.childNodes[0].data
memUsage = gpu.getElementsByTagName('fb_memory_usage')[0]
gpuInfo['gpuMemTotal'] = memUsage.getElementsByTagName('total')[0]\
.childNodes[0].data.replace("MiB", "").strip()
gpuInfo['gpuMemUsed'] = memUsage.getElementsByTagName('used')[0]\
.childNodes[0].data.replace("MiB", "").strip()
gpuInfo['gpuMemFree'] = memUsage.getElementsByTagName('free')[0]\
.childNodes[0].data.replace("MiB", "").strip()
gpuInfo['gpuType'] = gpuList[gpu][list(gpuList[gpu].keys())[0]]
gpuInfo['gpuMemTotal'] = round(float(gpuList[gpu][list(gpuList[gpu].keys())[3]])/1048576, 2)
gpuInfo['gpuMemUsed'] = round(float(gpuList[gpu][list(gpuList[gpu].keys())[4]])/1048576, 2)
gpuInfo['gpuMemFree'] = str(gpuInfo['gpuMemTotal'] - gpuInfo['gpuMemUsed'])
gpuInfo['gpuMemTotal'] = str(gpuInfo['gpuMemTotal']) + "MB"
gpuInfo['gpuMemUsed'] = str(gpuInfo['gpuMemUsed']) + "MB"
gpuInfo['gpuMemFree'] = str(gpuInfo['gpuMemFree']) + "MB"
output["gpuInfos"].append(gpuInfo)
except Exception:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment