Commit 662457ba authored by xuanbaby

DTK-203

feat: read GPU info from rocm-smi for NNI
parent c377abcf
@@ -8,6 +8,7 @@ trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
+ #useActiveGpu: true
tuner:
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner
#SMAC (SMAC should be installed through nnictl)
@@ -15,7 +16,8 @@ tuner:
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
+ gpuIndices: 1
trial:
- command: python3 mnist-keras.py
+ command: HIP_VISIBLE_DEVICES=0 NNI_OUTPUT_DIR=/data_share/xuan/nni/ python3 mnist-keras.py
codeDir: .
- gpuNum: 0
+ gpuNum: 1
@@ -65,7 +65,7 @@ def load_mnist_data(args):
'''
mnist_path = os.path.join(os.environ.get('NNI_OUTPUT_DIR'), 'mnist.npz')
(x_train, y_train), (x_test, y_test) = mnist.load_data(path=mnist_path)
- os.remove(mnist_path)
+ # os.remove(mnist_path)
x_train = (np.expand_dims(x_train, -1).astype(np.float) / 255.)[:args.num_train]
x_test = (np.expand_dims(x_test, -1).astype(np.float) / 255.)[:args.num_test]
@@ -128,6 +128,7 @@ if __name__ == '__main__':
# get parameters from tuner
RECEIVED_PARAMS = nni.get_next_parameter()
LOG.debug(RECEIVED_PARAMS)
print("xuana ",RECEIVED_PARAMS)
PARAMS = generate_default_params()
PARAMS.update(RECEIVED_PARAMS)
# train
......
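For reference, a minimal sketch of the data-loading pattern touched above: the trial command now exports NNI_OUTPUT_DIR, mnist.npz is cached under that directory, and the os.remove call is commented out so later trials can reuse the download. The keras import and the fallback directory are assumptions for illustration, not part of the commit.

    import os
    from keras.datasets import mnist  # assumed import; matches the mnist.load_data call above

    def load_cached_mnist():
        # NNI exports NNI_OUTPUT_DIR for each trial; the fallback here is illustrative only.
        output_dir = os.environ.get('NNI_OUTPUT_DIR', '/tmp/nni-output')
        mnist_path = os.path.join(output_dir, 'mnist.npz')
        # The archive stays on disk (os.remove is commented out in the diff),
        # so repeated trials reuse the same download instead of fetching it again.
        return mnist.load_data(path=mnist_path)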
@@ -113,7 +113,7 @@ def parse_rev_args(receive_msg):
# parallel model
try:
available_devices = os.environ["CUDA_VISIBLE_DEVICES"]
available_devices = os.environ["HIP_VISIBLE_DEVICES"]
gpus = len(available_devices.split(","))
if gpus > 1:
net = multi_gpu_model(net, gpus)
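A condensed sketch of the device-count logic in the hunk above: on ROCm, HIP_VISIBLE_DEVICES takes the role CUDA_VISIBLE_DEVICES plays on CUDA, and the comma-separated list is counted to decide whether multi_gpu_model is applied. The KeyError fallback of one device is an assumption about how the surrounding try/except behaves when the variable is unset.

    import os

    def visible_gpu_count():
        try:
            devices = os.environ["HIP_VISIBLE_DEVICES"]  # e.g. "0" or "0,1"
        except KeyError:
            return 1  # assumed default when the variable is not set
        return len(devices.split(","))

    # In the diff, multi_gpu_model(net, gpus) is only applied when this count is > 1.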
@@ -197,6 +197,7 @@ if __name__ == "__main__":
# trial get next parameter from network morphism tuner
RCV_CONFIG = nni.get_next_parameter()
logger.debug(RCV_CONFIG)
+ print(RCV_CONFIG)
parse_rev_args(RCV_CONFIG)
train_eval()
except Exception as exception:
......
@@ -24,6 +24,6 @@ tuner:
#number of classes
n_output_node: 10
trial:
- command: python3 FashionMNIST_keras.py
+ command: HIP_VISIBLE_DEVICES=0 NNI_OUTPUT_DIR=/data_share/xuan/nni/examples/trials/network_morphism/FashionMNIST/output python3 FashionMNIST_keras.py
codeDir: .
gpuNum: 1
@@ -9,12 +9,12 @@ import time
import traceback
from xml.dom import minidom
+ import json
def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
- cmd = 'nvidia-smi -q -x'.split()
+ cmd = 'rocm-smi -a --json'.split()
while(True):
try:
smi_output = subprocess.check_output(cmd)
@@ -30,25 +30,31 @@ def main(argv):
def parse_nvidia_smi_result(smi, outputDir):
try:
old_umask = os.umask(0)
- xmldoc = minidom.parseString(smi)
- gpuList = xmldoc.getElementsByTagName('gpu')
+ #xmldoc = minidom.parseString(smi)
+ smi = json.loads(smi)
+ #gpuList = xmldoc.getElementsByTagName('gpu')
+ gpuList = smi.keys()
with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
outPut = {}
outPut["Timestamp"] = time.asctime(time.localtime())
outPut["gpuCount"] = len(gpuList)
outPut["gpuCount"] = len(gpuList) - 1
outPut["gpuInfos"] = []
for gpuIndex, gpu in enumerate(gpuList):
+ if gpu == 'system':
+     continue
gpuInfo = {}
gpuInfo['index'] = gpuIndex
- gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
- .getElementsByTagName('gpu_util')[0]\
- .childNodes[0].data.replace("%", "").strip()
- gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
- .getElementsByTagName('memory_util')[0]\
- .childNodes[0].data.replace("%", "").strip()
- processes = gpu.getElementsByTagName('processes')
- runningProNumber = len(processes[0].getElementsByTagName('process_info'))
- gpuInfo['activeProcessNum'] = runningProNumber
+ gpuInfo['gpuUtil'] = smi[gpu]["GPU OverDrive value (%)"]
+ gpuInfo['gpuMemUtil'] = smi[gpu]["GPU Memory OverDrive value (%)"]
+ # gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
+ # .getElementsByTagName('gpu_util')[0]\
+ # .childNodes[0].data.replace("%", "").strip()
+ # gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
+ # .getElementsByTagName('memory_util')[0]\
+ # .childNodes[0].data.replace("%", "").strip()
+ # processes = gpu.getElementsByTagName('processes')
+ # runningProNumber = len(processes[0].getElementsByTagName('process_info'))
+ # gpuInfo['activeProcessNum'] = runningProNumber
outPut["gpuInfos"].append(gpuInfo)
print(outPut)
......
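A condensed, runnable sketch of the JSON-based collection introduced above. The key names ("GPU OverDrive value (%)", "GPU Memory OverDrive value (%)") are the ones used in the diff; the exact fields emitted by rocm-smi -a --json can differ between ROCm/DTK releases, so treat them as assumptions rather than a fixed schema.

    import json
    import subprocess
    import time

    def collect_once():
        # Query all attributes of all cards as JSON, as the diff does.
        smi_output = subprocess.check_output('rocm-smi -a --json'.split())
        smi = json.loads(smi_output)
        sample = {
            'Timestamp': time.asctime(time.localtime()),
            # rocm-smi also reports a non-GPU 'system' entry, hence the "- 1" in the hunk above.
            'gpuCount': len(smi.keys()) - 1,
            'gpuInfos': [],
        }
        for index, card in enumerate(smi):
            if card == 'system':
                continue
            sample['gpuInfos'].append({
                'index': index,
                'gpuUtil': smi[card]['GPU OverDrive value (%)'],     # key name taken from the diff
                'gpuMemUtil': smi[card]['GPU Memory OverDrive value (%)'],
            })
        return sample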
@@ -8,7 +8,7 @@ from xml.dom import minidom
def collect_gpu_usage(node_id):
- cmd = 'nvidia-smi -q -x'.split()
+ cmd = 'rocm-smi -a --json'.split()
info = None
try:
smi_output = subprocess.check_output(cmd)
@@ -22,33 +22,40 @@ def collect_gpu_usage(node_id):
def parse_nvidia_smi_result(smi):
try:
output = {}
- xmldoc = minidom.parseString(smi)
- gpuList = xmldoc.getElementsByTagName('gpu')
+ # xmldoc = minidom.parseString(smi)
+ # gpuList = xmldoc.getElementsByTagName('gpu')
+ smi = json.loads(smi)
+ gpuList = smi.keys()
output["Timestamp"] = time.asctime(time.localtime())
output["gpuCount"] = len(gpuList)
output["gpuInfos"] = []
for gpuIndex, gpu in enumerate(gpuList):
+ if gpu == 'system':
+     continue
gpuInfo = {}
gpuInfo['index'] = gpuIndex
- gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
- .getElementsByTagName('gpu_util')[0]\
- .childNodes[0].data.replace("%", "").strip()
- gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
- .getElementsByTagName('memory_util')[0]\
- .childNodes[0].data.replace("%", "").strip()
- processes = gpu.getElementsByTagName('processes')
- runningProNumber = len(processes[0].getElementsByTagName('process_info'))
- gpuInfo['activeProcessNum'] = runningProNumber
- gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name')[0]\
- .childNodes[0].data
- memUsage = gpu.getElementsByTagName('fb_memory_usage')[0]
- gpuInfo['gpuMemTotal'] = memUsage.getElementsByTagName('total')[0]\
- .childNodes[0].data.replace("MiB", "").strip()
- gpuInfo['gpuMemUsed'] = memUsage.getElementsByTagName('used')[0]\
- .childNodes[0].data.replace("MiB", "").strip()
- gpuInfo['gpuMemFree'] = memUsage.getElementsByTagName('free')[0]\
- .childNodes[0].data.replace("MiB", "").strip()
+ gpuInfo['gpuUtil'] = smi[gpu]["GPU OverDrive value (%)"]
+ gpuInfo['gpuMemUtil'] = smi[gpu]["GPU Memory OverDrive value (%)"]
+ # gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
+ # .getElementsByTagName('gpu_util')[0]\
+ # .childNodes[0].data.replace("%", "").strip()
+ # gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
+ # .getElementsByTagName('memory_util')[0]\
+ # .childNodes[0].data.replace("%", "").strip()
+ # processes = gpu.getElementsByTagName('processes')
+ # runningProNumber = len(processes[0].getElementsByTagName('process_info'))
+ # gpuInfo['activeProcessNum'] = runningProNumber
+ gpuInfo['gpuType'] = smi[gpu]["GPU ID"]
+ # gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name')[0]\
+ # .childNodes[0].data
+ # memUsage = gpu.getElementsByTagName('fb_memory_usage')[0]
+ gpuInfo['gpuMemUsed'] = smi[gpu]["GPU use (%)"]
+ # gpuInfo['gpuMemTotal'] = memUsage.getElementsByTagName('total')[0]\
+ # .childNodes[0].data.replace("MiB", "").strip()
+ # gpuInfo['gpuMemUsed'] = memUsage.getElementsByTagName('used')[0]\
+ # .childNodes[0].data.replace("MiB", "").strip()
+ # gpuInfo['gpuMemFree'] = memUsage.getElementsByTagName('free')[0]\
+ # .childNodes[0].data.replace("MiB", "").strip()
output["gpuInfos"].append(gpuInfo)
except Exception:
......
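Hypothetical usage of the collection sketched above: poll at a fixed interval and append one JSON line per sample to the gpu_metrics file that NNI reads. In the real scripts the output location comes from METRIC_OUTPUT_DIR (first collector) or a per-node path (second collector); the hard-coded path and interval below are illustrative only.

    import json
    import time

    def run_collector(metrics_path='/tmp/nni/gpu_metrics', interval=5):
        while True:
            sample = collect_once()  # sketch shown earlier on this page
            with open(metrics_path, 'a') as fp:
                fp.write(json.dumps(sample) + '\n')
            time.sleep(interval)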