Commit 662457ba authored by xuanbaby

DTK-203

feat: read GPU info from rocm-smi for NNI
parent c377abcf
@@ -8,6 +8,7 @@ trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
+ #useActiveGpu: true
tuner:
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner
#SMAC (SMAC should be installed through nnictl)
@@ -15,7 +16,8 @@ tuner:
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
+ gpuIndices: 1
trial:
- command: python3 mnist-keras.py
+ command: HIP_VISIBLE_DEVICES=0 NNI_OUTPUT_DIR=/data_share/xuan/nni/ python3 mnist-keras.py
codeDir: .
- gpuNum: 0
+ gpuNum: 1
@@ -65,7 +65,7 @@ def load_mnist_data(args):
'''
mnist_path = os.path.join(os.environ.get('NNI_OUTPUT_DIR'), 'mnist.npz')
(x_train, y_train), (x_test, y_test) = mnist.load_data(path=mnist_path)
- os.remove(mnist_path)
+ # os.remove(mnist_path)
x_train = (np.expand_dims(x_train, -1).astype(np.float) / 255.)[:args.num_train]
x_test = (np.expand_dims(x_test, -1).astype(np.float) / 255.)[:args.num_test]
@@ -128,6 +128,7 @@ if __name__ == '__main__':
# get parameters from tuner
RECEIVED_PARAMS = nni.get_next_parameter()
LOG.debug(RECEIVED_PARAMS)
print("xuana ",RECEIVED_PARAMS)
PARAMS = generate_default_params()
PARAMS.update(RECEIVED_PARAMS)
# train
......
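For reference, a minimal sketch of the data-loading pattern touched above: the trial command now exports NNI_OUTPUT_DIR, mnist.npz is cached under that directory, and the os.remove call is commented out so later trials can reuse the download. The keras import and the fallback directory are assumptions for illustration, not part of the commit.

    import os
    from keras.datasets import mnist  # assumed import; matches the mnist.load_data call above

    def load_cached_mnist():
        # NNI exports NNI_OUTPUT_DIR for each trial; the fallback here is illustrative only.
        output_dir = os.environ.get('NNI_OUTPUT_DIR', '/tmp/nni-output')
        mnist_path = os.path.join(output_dir, 'mnist.npz')
        # The archive stays on disk (os.remove is commented out in the diff),
        # so repeated trials reuse the same download instead of fetching it again.
        return mnist.load_data(path=mnist_path)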
@@ -113,7 +113,7 @@ def parse_rev_args(receive_msg):
# parallel model
try:
available_devices = os.environ["CUDA_VISIBLE_DEVICES"]
available_devices = os.environ["HIP_VISIBLE_DEVICES"]
gpus = len(available_devices.split(","))
if gpus > 1:
net = multi_gpu_model(net, gpus)
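A condensed sketch of the device-count logic in the hunk above: on ROCm, HIP_VISIBLE_DEVICES takes the role CUDA_VISIBLE_DEVICES plays on CUDA, and the comma-separated list is counted to decide whether multi_gpu_model is applied. The KeyError fallback of one device is an assumption about how the surrounding try/except behaves when the variable is unset.

    import os

    def visible_gpu_count():
        try:
            devices = os.environ["HIP_VISIBLE_DEVICES"]  # e.g. "0" or "0,1"
        except KeyError:
            return 1  # assumed default when the variable is not set
        return len(devices.split(","))

    # In the diff, multi_gpu_model(net, gpus) is only applied when this count is > 1.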
@@ -197,6 +197,7 @@ if __name__ == "__main__":
# trial get next parameter from network morphism tuner
RCV_CONFIG = nni.get_next_parameter()
logger.debug(RCV_CONFIG)
+ print(RCV_CONFIG)
parse_rev_args(RCV_CONFIG)
train_eval()
except Exception as exception:
......
@@ -24,6 +24,6 @@ tuner:
#number of classes
n_output_node: 10
trial:
- command: python3 FashionMNIST_keras.py
+ command: HIP_VISIBLE_DEVICES=0 NNI_OUTPUT_DIR=/data_share/xuan/nni/examples/trials/network_morphism/FashionMNIST/output python3 FashionMNIST_keras.py
codeDir: .
gpuNum: 1
@@ -9,12 +9,12 @@ import time
import traceback
from xml.dom import minidom
+ import json
def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
- cmd = 'nvidia-smi -q -x'.split()
+ cmd = 'rocm-smi -a --json'.split()
while(True):
try:
smi_output = subprocess.check_output(cmd)
@@ -30,25 +30,31 @@ def main(argv):
def parse_nvidia_smi_result(smi, outputDir):
try:
old_umask = os.umask(0)
- xmldoc = minidom.parseString(smi)
- gpuList = xmldoc.getElementsByTagName('gpu')
+ #xmldoc = minidom.parseString(smi)
+ smi = json.loads(smi)
+ #gpuList = xmldoc.getElementsByTagName('gpu')
+ gpuList = smi.keys()
with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
outPut = {}
outPut["Timestamp"] = time.asctime(time.localtime())
outPut["gpuCount"] = len(gpuList)
outPut["gpuCount"] = len(gpuList) - 1
outPut["gpuInfos"] = []
for gpuIndex, gpu in enumerate(gpuList):
+ if gpu == 'system':
+     continue
gpuInfo = {}
gpuInfo['index'] = gpuIndex
- gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
- .getElementsByTagName('gpu_util')[0]\
- .childNodes[0].data.replace("%", "").strip()
- gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
- .getElementsByTagName('memory_util')[0]\
- .childNodes[0].data.replace("%", "").strip()
- processes = gpu.getElementsByTagName('processes')
- runningProNumber = len(processes[0].getElementsByTagName('process_info'))
- gpuInfo['activeProcessNum'] = runningProNumber
+ gpuInfo['gpuUtil'] = smi[gpu]["GPU OverDrive value (%)"]
+ gpuInfo['gpuMemUtil'] = smi[gpu]["GPU Memory OverDrive value (%)"]
+ # gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
+ # .getElementsByTagName('gpu_util')[0]\
+ # .childNodes[0].data.replace("%", "").strip()
+ # gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
+ # .getElementsByTagName('memory_util')[0]\
+ # .childNodes[0].data.replace("%", "").strip()
+ # processes = gpu.getElementsByTagName('processes')
+ # runningProNumber = len(processes[0].getElementsByTagName('process_info'))
+ # gpuInfo['activeProcessNum'] = runningProNumber
outPut["gpuInfos"].append(gpuInfo)
print(outPut)
......
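A condensed, runnable sketch of the JSON-based collection introduced above. The key names ("GPU OverDrive value (%)", "GPU Memory OverDrive value (%)") are the ones used in the diff; the exact fields emitted by rocm-smi -a --json can differ between ROCm/DTK releases, so treat them as assumptions rather than a fixed schema.

    import json
    import subprocess
    import time

    def collect_once():
        # Query all attributes of all cards as JSON, as the diff does.
        smi_output = subprocess.check_output('rocm-smi -a --json'.split())
        smi = json.loads(smi_output)
        sample = {
            'Timestamp': time.asctime(time.localtime()),
            # rocm-smi also reports a non-GPU 'system' entry, hence the "- 1" in the hunk above.
            'gpuCount': len(smi.keys()) - 1,
            'gpuInfos': [],
        }
        for index, card in enumerate(smi):
            if card == 'system':
                continue
            sample['gpuInfos'].append({
                'index': index,
                'gpuUtil': smi[card]['GPU OverDrive value (%)'],     # key name taken from the diff
                'gpuMemUtil': smi[card]['GPU Memory OverDrive value (%)'],
            })
        return sample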
@@ -8,7 +8,7 @@ from xml.dom import minidom
def collect_gpu_usage(node_id):
- cmd = 'nvidia-smi -q -x'.split()
+ cmd = 'rocm-smi -a --json'.split()
info = None
try:
smi_output = subprocess.check_output(cmd)
@@ -22,33 +22,40 @@ def collect_gpu_usage(node_id):
def parse_nvidia_smi_result(smi):
try:
output = {}
- xmldoc = minidom.parseString(smi)
- gpuList = xmldoc.getElementsByTagName('gpu')
+ # xmldoc = minidom.parseString(smi)
+ # gpuList = xmldoc.getElementsByTagName('gpu')
+ smi = json.loads(smi)
+ gpuList = smi.keys()
output["Timestamp"] = time.asctime(time.localtime())
output["gpuCount"] = len(gpuList)
output["gpuInfos"] = []
for gpuIndex, gpu in enumerate(gpuList):
+ if gpu == 'system':
+     continue
gpuInfo = {}
gpuInfo['index'] = gpuIndex
- gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
- .getElementsByTagName('gpu_util')[0]\
- .childNodes[0].data.replace("%", "").strip()
- gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
- .getElementsByTagName('memory_util')[0]\
- .childNodes[0].data.replace("%", "").strip()
- processes = gpu.getElementsByTagName('processes')
- runningProNumber = len(processes[0].getElementsByTagName('process_info'))
- gpuInfo['activeProcessNum'] = runningProNumber
- gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name')[0]\
- .childNodes[0].data
- memUsage = gpu.getElementsByTagName('fb_memory_usage')[0]
- gpuInfo['gpuMemTotal'] = memUsage.getElementsByTagName('total')[0]\
- .childNodes[0].data.replace("MiB", "").strip()
- gpuInfo['gpuMemUsed'] = memUsage.getElementsByTagName('used')[0]\
- .childNodes[0].data.replace("MiB", "").strip()
- gpuInfo['gpuMemFree'] = memUsage.getElementsByTagName('free')[0]\
- .childNodes[0].data.replace("MiB", "").strip()
+ gpuInfo['gpuUtil'] = smi[gpu]["GPU OverDrive value (%)"]
+ gpuInfo['gpuMemUtil'] = smi[gpu]["GPU Memory OverDrive value (%)"]
+ # gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
+ # .getElementsByTagName('gpu_util')[0]\
+ # .childNodes[0].data.replace("%", "").strip()
+ # gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
+ # .getElementsByTagName('memory_util')[0]\
+ # .childNodes[0].data.replace("%", "").strip()
+ # processes = gpu.getElementsByTagName('processes')
+ # runningProNumber = len(processes[0].getElementsByTagName('process_info'))
+ # gpuInfo['activeProcessNum'] = runningProNumber
+ gpuInfo['gpuType'] = smi[gpu]["GPU ID"]
+ # gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name')[0]\
+ # .childNodes[0].data
+ # memUsage = gpu.getElementsByTagName('fb_memory_usage')[0]
+ gpuInfo['gpuMemUsed'] = smi[gpu]["GPU use (%)"]
+ # gpuInfo['gpuMemTotal'] = memUsage.getElementsByTagName('total')[0]\
+ # .childNodes[0].data.replace("MiB", "").strip()
+ # gpuInfo['gpuMemUsed'] = memUsage.getElementsByTagName('used')[0]\
+ # .childNodes[0].data.replace("MiB", "").strip()
+ # gpuInfo['gpuMemFree'] = memUsage.getElementsByTagName('free')[0]\
+ # .childNodes[0].data.replace("MiB", "").strip()
output["gpuInfos"].append(gpuInfo)
except Exception:
......
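Hypothetical usage of the collection sketched above: poll at a fixed interval and append one JSON line per sample to the gpu_metrics file that NNI reads. In the real scripts the output location comes from METRIC_OUTPUT_DIR (first collector) or a per-node path (second collector); the hard-coded path and interval below are illustrative only.

    import json
    import time

    def run_collector(metrics_path='/tmp/nni/gpu_metrics', interval=5):
        while True:
            sample = collect_once()  # sketch shown earlier on this page
            with open(metrics_path, 'a') as fp:
                fp.write(json.dumps(sample) + '\n')
            time.sleep(interval)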