Support 'nnictl top' (#464)

Add nnictl top command to monitor the nni experiments.

Support 'nnictl top' (#464)
Add nnictl top command to monitor the nni experiments.
07e19a30 · SparkSnail · GitHub · e31839cc · 07e19a30 · 07e19a30
Unverified Commit 07e19a30 authored Dec 13, 2018 by SparkSnail Committed by GitHub Dec 13, 2018
6 changed files
--- a/docs/NNICTLDOC.md
+++ b/docs/NNICTLDOC.md
@@ -16,6 +16,7 @@ nnictl config
 nnictl log
 nnictl webui
 nnictl tensorboard
+nnictl top
 ```
 ### Manage an experiment
 * __nnictl create__ 
@@ -172,7 +173,24 @@ nnictl tensorboard
          | ------ | ------ | ------ |------ |
         | id|  False| |ID of the experiment you want to set|   
         | --trialid, -t|  True| |ID of the trial you want to kill.| 
+  * __nnictl top__
+      * Description
+        Monitor all running experiments.
+      * Usage
+              nnictl top
+      	Options:  
+          | Name, shorthand | Required|Default | Description |
+          | ------ | ------ | ------ |------ |
+         | id|  False| |ID of the experiment you want to set|   
+         | --time, -t|  False| |The interval at which the experiment status is refreshed, in seconds; the default value is 3 seconds.| 
 ### Manage experiment information

--- a/tools/nni_cmd/config_utils.py
+++ b/tools/nni_cmd/config_utils.py
@@ -73,7 +73,7 @@ class Experiments:
        self.experiment_file = os.path.join(NNICTL_HOME_DIR, '.experiment')
        self.experiments = self.read_file()
-    def add_experiment(self, id, port, time, file_name):
+    def add_experiment(self, id, port, time, file_name, platform):
        '''set {key:value} paris to self.experiment'''
        self.experiments[id] = {}
        self.experiments[id]['port'] = port
@@ -81,6 +81,7 @@ class Experiments:
        self.experiments[id]['endTime'] = 'N/A'
        self.experiments[id]['status'] = 'running'
        self.experiments[id]['fileName'] = file_name
+        self.experiments[id]['platform'] = platform
        self.write_file()
    def update_experiment(self, id, key, value):

--- a/tools/nni_cmd/constants.py
+++ b/tools/nni_cmd/constants.py
@@ -40,11 +40,12 @@ EXPERIMENT_SUCCESS_INFO = '\033[1;32;32mSuccessfully started experiment!\n\033[0
                          '         commands                       description\n' \
                          '1. nnictl experiment show        show the information of experiments\n' \
                          '2. nnictl trial ls               list all of trial jobs\n' \
-                          '3. nnictl log stderr             show stderr log content\n' \
+                          '3. nnictl top                    monitor the status of running experiments\n' \
-                          '4. nnictl log stdout             show stdout log content\n' \
+                          '4. nnictl log stderr             show stderr log content\n' \
-                          '5. nnictl stop                   stop an experiment\n' \
+                          '5. nnictl log stdout             show stdout log content\n' \
-                          '6. nnictl trial kill             kill a trial job by id\n' \
+                          '6. nnictl stop                   stop an experiment\n' \
-                          '7. nnictl --help                 get help information about nnictl\n' \
+                          '7. nnictl trial kill             kill a trial job by id\n' \
+                          '8. nnictl --help                 get help information about nnictl\n' \
                          '-----------------------------------------------------------------------\n' \
 LOG_HEADER = '-----------------------------------------------------------------------\n' \
@@ -54,12 +55,23 @@ LOG_HEADER = '------------------------------------------------------------------
 EXPERIMENT_START_FAILED_INFO = 'There is an experiment running in the port %d, please stop it first or set another port!\n' \
                               'You could use \'nnictl stop --port [PORT]\' command to stop an experiment!\nOr you could use \'nnictl create --config [CONFIG_PATH] --port [PORT]\' to set port!\n'
-EXPERIMENT_INFORMATION_FORMAT = '-----------------------------------------------------------------------\n' \
+EXPERIMENT_INFORMATION_FORMAT = '----------------------------------------------------------------------------------------\n' \
                     '                Experiment information\n' \
                     '%s\n' \
-                     '-----------------------------------------------------------------------\n'
+                     '----------------------------------------------------------------------------------------\n'
-EXPERIMENT_DETAIL_FORMAT = 'Id: %s    Status: %s    Port: %s    StartTime: %s    EndTime: %s    \n'
+EXPERIMENT_DETAIL_FORMAT = 'Id: %s    Status: %s    Port: %s    Platform: %s    StartTime: %s    EndTime: %s    \n'
+EXPERIMENT_MONITOR_INFO = 'Id: %s    Status: %s    Port: %s    Platform: %s    \n' \
+                          'StartTime: %s    Duration: %s'
+TRIAL_MONITOR_HEAD = '-------------------------------------------------------------------------------------\n' + \
+                    '%-15s %-25s %-25s %-15s \n' % ('trialId', 'startTime', 'endTime', 'status') + \
+                     '-------------------------------------------------------------------------------------'
+TRIAL_MONITOR_CONTENT = '%-15s %-25s %-25s %-15s'
+TRIAL_MONITOR_TAIL = '-------------------------------------------------------------------------------------\n\n\n'
 PACKAGE_REQUIREMENTS = {
    'SMAC': 'smac_tuner'

--- a/tools/nni_cmd/launcher.py
+++ b/tools/nni_cmd/launcher.py
@@ -366,8 +366,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
    nni_config.set_config('webuiUrl', web_ui_url_list)
    #save experiment information
-    experiment_config = Experiments()
+    nnictl_experiment_config = Experiments()
-    experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name)
+    nnictl_experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name, experiment_config['trainingServicePlatform'])
    print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, '   '.join(web_ui_url_list)))

--- a/tools/nni_cmd/nnictl.py
+++ b/tools/nni_cmd/nnictl.py
@@ -161,6 +161,12 @@ def parse_args():
    parser_tensorboard_start.add_argument('id', nargs='?', help='the id of experiment')
    parser_tensorboard_start.set_defaults(func=stop_tensorboard)
+    #parse top command
+    parser_top = subparsers.add_parser('top', help='monitor the experiment')
+    parser_top.add_argument('--time', '-t', dest='time', type=int, default=3, help='the time interval to update the experiment status, ' \
+    'the unit is second')
+    parser_top.set_defaults(func=monitor_experiment)
    args = parser.parse_args()
    args.func(args)

--- a/tools/nni_cmd/nnictl_utils.py
+++ b/tools/nni_cmd/nnictl_utils.py
@@ -27,8 +27,8 @@ from subprocess import call, check_output
 from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response
 from .config_utils import Config, Experiments
 from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url
-from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT
+from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \
-import time
+     EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL
 from .common_utils import print_normal, print_error, print_warning, detect_process
 def update_experiment_status():
@@ -68,7 +68,7 @@ def check_experiment_id(args):
            experiment_information = ""
            for key in running_experiment_list:
                experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
-                experiment_dict[key]['port'], experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
+                experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
            print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
            exit(1)
        elif not running_experiment_list:
@@ -112,7 +112,7 @@ def parse_ids(args):
            experiment_information = ""
            for key in running_experiment_list:
                experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
-                experiment_dict[key]['port'], experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
+                experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
            print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
            exit(1)
        else:
@@ -367,6 +367,69 @@ def experiment_list(args):
    experiment_information = ""
    for key in experiment_id_list:
        experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
-        experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
+        experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
    print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
+def get_time_interval(time1, time2):
+    '''Return the elapsed time from time1 to time2 formatted as 'Dd Hh Mm Ss'.
+    Both arguments are strings in '%Y-%m-%d %H:%M:%S' format.
+    'N/A' is returned when either value cannot be parsed or converted.
+    A negative interval (time2 earlier than time1) is reported as zero.'''
+    try:
+        #convert time to timestamp
+        time1 = time.mktime(time.strptime(time1, '%Y-%m-%d %H:%M:%S'))
+        time2 = time.mktime(time.strptime(time2, '%Y-%m-%d %H:%M:%S'))
+        delta = datetime.datetime.fromtimestamp(time2) - datetime.datetime.fromtimestamp(time1)
+    except (TypeError, ValueError, OverflowError, OSError):
+        #malformed or non-string input, or a timestamp the platform cannot represent
+        return 'N/A'
+    #timedelta.seconds is only the sub-day remainder; total_seconds() keeps whole days
+    seconds = max(int(delta.total_seconds()), 0)
+    #convert seconds to day:hour:minute:second
+    days, seconds = divmod(seconds, 86400)
+    hours, seconds = divmod(seconds, 3600)
+    minutes, seconds = divmod(seconds, 60)
+    return '%dd %dh %dm %ds' % (days, hours, minutes, seconds)
+def show_experiment_info():
+    '''print a summary and the trial job table for every running experiment'''
+    all_experiments = Experiments().get_all_experiments()
+    if not all_experiments:
+        print('There is no experiment running...')
+        exit(1)
+    #only experiments recorded with status 'running' are displayed
+    running_ids = [exp_id for exp_id in all_experiments.keys() if all_experiments[exp_id]['status'] == 'running']
+    if not running_ids:
+        print_warning('There is no experiment running...')
+        return
+    for exp_id in running_ids:
+        info = all_experiments[exp_id]
+        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
+        #'platform' is read with .get(), so a record without it prints None
+        summary = (exp_id, info['status'], info['port'], info.get('platform'), info['startTime'], \
+                   get_time_interval(info['startTime'], now))
+        print(EXPERIMENT_MONITOR_INFO % summary)
+        print(TRIAL_MONITOR_HEAD)
+        #the trial rows are skipped when the rest server is not reachable
+        running, _ = check_rest_server_quick(info['port'])
+        if running:
+            response = rest_get(trial_jobs_url(info['port']), 20)
+            if response and check_response(response):
+                for trial in json.loads(response.text):
+                    trial = convert_time_stamp_to_date(trial)
+                    row = (trial.get('id'), trial.get('startTime'), trial.get('endTime'), trial.get('status'))
+                    print(TRIAL_MONITOR_CONTENT % row)
+        print(TRIAL_MONITOR_TAIL)
+def monitor_experiment(args):
+    '''refresh and redraw the experiment monitor every args.time seconds until interrupted'''
+    interval = args.time
+    if interval <= 0:
+        print_error('please input a positive integer as time interval, the unit is second.')
+        exit(1)
+    #both handlers terminate the process, so one try around the loop is sufficient
+    try:
+        while True:
+            os.system('clear')
+            update_experiment_status()
+            show_experiment_info()
+            time.sleep(interval)
+    except KeyboardInterrupt:
+        #Ctrl-C is the normal way to leave the monitor
+        exit(0)
+    except Exception as exception:
+        print_error(exception)
+        exit(1)