"setup.py" did not exist on "64f17c2f369e612cc297d358f607307a615bbb59"
Commit 662457ba authored by xuanbaby

DTK-203

feat: read GPU info from rocm-smi for NNI
parent c377abcf
Trial config for the mnist-keras example:

@@ -8,6 +8,7 @@ trainingServicePlatform: local
 searchSpacePath: search_space.json
 #choice: true, false
 useAnnotation: false
+#useActiveGpu: true
 tuner:
   #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner
   #SMAC (SMAC should be installed through nnictl)
@@ -15,7 +16,8 @@ tuner:
   classArgs:
     #choice: maximize, minimize
     optimize_mode: maximize
+  gpuIndices: 1
 trial:
-  command: python3 mnist-keras.py
+  command: HIP_VISIBLE_DEVICES=0 NNI_OUTPUT_DIR=/data_share/xuan/nni/ python3 mnist-keras.py
   codeDir: .
-  gpuNum: 0
+  gpuNum: 1
mnist-keras.py:

@@ -65,7 +65,7 @@ def load_mnist_data(args):
     '''
     mnist_path = os.path.join(os.environ.get('NNI_OUTPUT_DIR'), 'mnist.npz')
     (x_train, y_train), (x_test, y_test) = mnist.load_data(path=mnist_path)
-    os.remove(mnist_path)
+    # os.remove(mnist_path)
     x_train = (np.expand_dims(x_train, -1).astype(np.float) / 255.)[:args.num_train]
     x_test = (np.expand_dims(x_test, -1).astype(np.float) / 255.)[:args.num_test]

@@ -128,6 +128,7 @@ if __name__ == '__main__':
         # get parameters from tuner
         RECEIVED_PARAMS = nni.get_next_parameter()
         LOG.debug(RECEIVED_PARAMS)
+        print("xuana ", RECEIVED_PARAMS)
         PARAMS = generate_default_params()
         PARAMS.update(RECEIVED_PARAMS)
         # train
...@@ -113,7 +113,7 @@ def parse_rev_args(receive_msg): ...@@ -113,7 +113,7 @@ def parse_rev_args(receive_msg):
# parallel model # parallel model
try: try:
available_devices = os.environ["CUDA_VISIBLE_DEVICES"] available_devices = os.environ["HIP_VISIBLE_DEVICES"]
gpus = len(available_devices.split(",")) gpus = len(available_devices.split(","))
if gpus > 1: if gpus > 1:
net = multi_gpu_model(net, gpus) net = multi_gpu_model(net, gpus)
...@@ -197,6 +197,7 @@ if __name__ == "__main__": ...@@ -197,6 +197,7 @@ if __name__ == "__main__":
# trial get next parameter from network morphism tuner # trial get next parameter from network morphism tuner
RCV_CONFIG = nni.get_next_parameter() RCV_CONFIG = nni.get_next_parameter()
logger.debug(RCV_CONFIG) logger.debug(RCV_CONFIG)
print(RCV_CONFIG)
parse_rev_args(RCV_CONFIG) parse_rev_args(RCV_CONFIG)
train_eval() train_eval()
except Exception as exception: except Exception as exception:
......
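The HIP_VISIBLE_DEVICES value exported by the trial command is exactly what this script reads back to decide whether to wrap the Keras model with multi_gpu_model. Below is a minimal, standalone sketch of that handshake; using os.environ.get with a default is an assumption for illustration (the patched code indexes os.environ directly inside a try/except).

import os

# Simulate what the trial command in config.yml exports before launching the script.
os.environ.setdefault("HIP_VISIBLE_DEVICES", "0")             # e.g. "0" or "0,1"

# Mirror of the patched logic: count the visible ROCm devices from the variable.
available_devices = os.environ.get("HIP_VISIBLE_DEVICES", "")
gpus = len(available_devices.split(",")) if available_devices else 0

if gpus > 1:
    print("would call multi_gpu_model(net, gpus=%d)" % gpus)  # data-parallel wrap
else:
    print("single visible device, no wrapping needed")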
Trial config for the FashionMNIST network-morphism example:

@@ -24,6 +24,6 @@ tuner:
     #number of classes
     n_output_node: 10
 trial:
-  command: python3 FashionMNIST_keras.py
+  command: HIP_VISIBLE_DEVICES=0 NNI_OUTPUT_DIR=/data_share/xuan/nni/examples/trials/network_morphism/FashionMNIST/output python3 FashionMNIST_keras.py
   codeDir: .
   gpuNum: 1
GPU metrics collector (queries rocm-smi instead of nvidia-smi):

@@ -9,12 +9,12 @@ import time
 import traceback
 from xml.dom import minidom
+import json

 def main(argv):
     metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
-    cmd = 'nvidia-smi -q -x'.split()
+    cmd = 'rocm-smi -a --json'.split()
     while(True):
         try:
             smi_output = subprocess.check_output(cmd)

@@ -30,25 +30,31 @@ def main(argv):
 def parse_nvidia_smi_result(smi, outputDir):
     try:
         old_umask = os.umask(0)
-        xmldoc = minidom.parseString(smi)
-        gpuList = xmldoc.getElementsByTagName('gpu')
+        #xmldoc = minidom.parseString(smi)
+        smi = json.loads(smi)
+        #gpuList = xmldoc.getElementsByTagName('gpu')
+        gpuList = smi.keys()
         with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
             outPut = {}
             outPut["Timestamp"] = time.asctime(time.localtime())
-            outPut["gpuCount"] = len(gpuList)
+            outPut["gpuCount"] = len(gpuList) - 1
             outPut["gpuInfos"] = []
             for gpuIndex, gpu in enumerate(gpuList):
+                if gpu == 'system':
+                    continue
                 gpuInfo = {}
                 gpuInfo['index'] = gpuIndex
-                gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
-                    .getElementsByTagName('gpu_util')[0]\
-                    .childNodes[0].data.replace("%", "").strip()
-                gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
-                    .getElementsByTagName('memory_util')[0]\
-                    .childNodes[0].data.replace("%", "").strip()
-                processes = gpu.getElementsByTagName('processes')
-                runningProNumber = len(processes[0].getElementsByTagName('process_info'))
-                gpuInfo['activeProcessNum'] = runningProNumber
+                gpuInfo['gpuUtil'] = smi[gpu]["GPU OverDrive value (%)"]
+                gpuInfo['gpuMemUtil'] = smi[gpu]["GPU Memory OverDrive value (%)"]
+                # gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
+                #     .getElementsByTagName('gpu_util')[0]\
+                #     .childNodes[0].data.replace("%", "").strip()
+                # gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
+                #     .getElementsByTagName('memory_util')[0]\
+                #     .childNodes[0].data.replace("%", "").strip()
+                # processes = gpu.getElementsByTagName('processes')
+                # runningProNumber = len(processes[0].getElementsByTagName('process_info'))
+                # gpuInfo['activeProcessNum'] = runningProNumber
                 outPut["gpuInfos"].append(gpuInfo)
             print(outPut)
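The rewritten parser assumes `rocm-smi -a --json` prints a single JSON object whose keys are the individual cards plus a `system` entry, and that each card carries the percentage fields read above. A small standalone sketch of that parsing path follows; the payload is hypothetical (card names and values are invented), only the field names come from the patch.

import json

# Hypothetical rocm-smi -a --json payload: card names and numbers are made up,
# only the field names match what the patched collector reads.
smi_output = json.dumps({
    "card0": {"GPU ID": "0x66af",
              "GPU OverDrive value (%)": "0",
              "GPU Memory OverDrive value (%)": "0",
              "GPU use (%)": "37"},
    "system": {"Driver version": "x.y"},
})

smi = json.loads(smi_output)
gpu_infos = []
for index, card in enumerate(smi):
    if card == 'system':                      # skip the non-GPU summary entry
        continue
    gpu_infos.append({
        'index': index,
        'gpuUtil': smi[card]["GPU OverDrive value (%)"],
        'gpuMemUtil': smi[card]["GPU Memory OverDrive value (%)"],
    })

print({"gpuCount": len(smi) - 1, "gpuInfos": gpu_infos})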
Node GPU usage collector:

@@ -8,7 +8,7 @@ from xml.dom import minidom

 def collect_gpu_usage(node_id):
-    cmd = 'nvidia-smi -q -x'.split()
+    cmd = 'rocm-smi -a --json'.split()
     info = None
     try:
         smi_output = subprocess.check_output(cmd)

@@ -22,33 +22,40 @@ def collect_gpu_usage(node_id):
 def parse_nvidia_smi_result(smi):
     try:
         output = {}
-        xmldoc = minidom.parseString(smi)
-        gpuList = xmldoc.getElementsByTagName('gpu')
+        # xmldoc = minidom.parseString(smi)
+        # gpuList = xmldoc.getElementsByTagName('gpu')
+        smi = json.loads(smi)
+        gpuList = smi.keys()
         output["Timestamp"] = time.asctime(time.localtime())
         output["gpuCount"] = len(gpuList)
         output["gpuInfos"] = []
         for gpuIndex, gpu in enumerate(gpuList):
+            if gpu == 'system':
+                break
             gpuInfo = {}
             gpuInfo['index'] = gpuIndex
-            gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
-                .getElementsByTagName('gpu_util')[0]\
-                .childNodes[0].data.replace("%", "").strip()
-            gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
-                .getElementsByTagName('memory_util')[0]\
-                .childNodes[0].data.replace("%", "").strip()
-            processes = gpu.getElementsByTagName('processes')
-            runningProNumber = len(processes[0].getElementsByTagName('process_info'))
-            gpuInfo['activeProcessNum'] = runningProNumber
-            gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name')[0]\
-                .childNodes[0].data
-            memUsage = gpu.getElementsByTagName('fb_memory_usage')[0]
-            gpuInfo['gpuMemTotal'] = memUsage.getElementsByTagName('total')[0]\
-                .childNodes[0].data.replace("MiB", "").strip()
-            gpuInfo['gpuMemUsed'] = memUsage.getElementsByTagName('used')[0]\
-                .childNodes[0].data.replace("MiB", "").strip()
-            gpuInfo['gpuMemFree'] = memUsage.getElementsByTagName('free')[0]\
-                .childNodes[0].data.replace("MiB", "").strip()
+            gpuInfo['gpuUtil'] = smi[gpu]["GPU OverDrive value (%)"]
+            gpuInfo['gpuMemUtil'] = smi[gpu]["GPU Memory OverDrive value (%)"]
+            # gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
+            #     .getElementsByTagName('gpu_util')[0]\
+            #     .childNodes[0].data.replace("%", "").strip()
+            # gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
+            #     .getElementsByTagName('memory_util')[0]\
+            #     .childNodes[0].data.replace("%", "").strip()
+            # processes = gpu.getElementsByTagName('processes')
+            # runningProNumber = len(processes[0].getElementsByTagName('process_info'))
+            # gpuInfo['activeProcessNum'] = runningProNumber
+            gpuInfo['gpuType'] = smi[gpu]["GPU ID"]
+            # gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name')[0]\
+            #     .childNodes[0].data
+            # memUsage = gpu.getElementsByTagName('fb_memory_usage')[0]
+            gpuInfo['gpuMemUsed'] = smi[gpu]["GPU use (%)"]
+            # gpuInfo['gpuMemTotal'] = memUsage.getElementsByTagName('total')[0]\
+            #     .childNodes[0].data.replace("MiB", "").strip()
+            # gpuInfo['gpuMemUsed'] = memUsage.getElementsByTagName('used')[0]\
+            #     .childNodes[0].data.replace("MiB", "").strip()
+            # gpuInfo['gpuMemFree'] = memUsage.getElementsByTagName('free')[0]\
+            #     .childNodes[0].data.replace("MiB", "").strip()
             output["gpuInfos"].append(gpuInfo)
     except Exception:
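This parser leaves its loop with break at the 'system' key, whereas the metrics collector above skips it with continue; the two are only equivalent when 'system' happens to be the last key returned by json.loads. A toy loop with a hypothetical key ordering makes the difference visible.

# Hypothetical key order; Python 3.7+ dicts preserve insertion order from JSON.
keys = ["card0", "system", "card1"]

skipped = [k for k in keys if k != "system"]   # continue-style: keeps every card

stopped = []
for k in keys:
    if k == "system":
        break                                  # break-style: 'card1' is never seen
    stopped.append(k)

print(skipped, stopped)                        # ['card0', 'card1'] ['card0']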