"host/host_tensor/src/device.cpp" did not exist on "fbdf4332c79a18454a553105ae5373911b2ba4ce"
Unverified Commit 07e19a30 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Support 'nnictl top' (#464)

Add nnictl top command to monitor the nni experiments.
parent e31839cc
...@@ -16,6 +16,7 @@ nnictl config ...@@ -16,6 +16,7 @@ nnictl config
nnictl log nnictl log
nnictl webui nnictl webui
nnictl tensorboard nnictl tensorboard
nnictl top
``` ```
### Manage an experiment ### Manage an experiment
* __nnictl create__ * __nnictl create__
...@@ -172,7 +173,24 @@ nnictl tensorboard ...@@ -172,7 +173,24 @@ nnictl tensorboard
| ------ | ------ | ------ |------ | | ------ | ------ | ------ |------ |
| id| False| |ID of the experiment you want to set| | id| False| |ID of the experiment you want to set|
| --trialid, -t| True| |ID of the trial you want to kill.| | --trialid, -t| True| |ID of the trial you want to kill.|
* __nnictl top__
* Description
Monitor all running experiments.
* Usage
nnictl top
Options:
| Name, shorthand | Required|Default | Description |
| ------ | ------ | ------ |------ |
| id| False| |ID of the experiment you want to set|
| --time, -t| False| 3|The interval, in seconds, at which the experiment status is updated. The default is 3 seconds.|
### Manage experiment information ### Manage experiment information
......
...@@ -73,7 +73,7 @@ class Experiments: ...@@ -73,7 +73,7 @@ class Experiments:
self.experiment_file = os.path.join(NNICTL_HOME_DIR, '.experiment') self.experiment_file = os.path.join(NNICTL_HOME_DIR, '.experiment')
self.experiments = self.read_file() self.experiments = self.read_file()
def add_experiment(self, id, port, time, file_name): def add_experiment(self, id, port, time, file_name, platform):
'''set {key:value} paris to self.experiment''' '''set {key:value} paris to self.experiment'''
self.experiments[id] = {} self.experiments[id] = {}
self.experiments[id]['port'] = port self.experiments[id]['port'] = port
...@@ -81,6 +81,7 @@ class Experiments: ...@@ -81,6 +81,7 @@ class Experiments:
self.experiments[id]['endTime'] = 'N/A' self.experiments[id]['endTime'] = 'N/A'
self.experiments[id]['status'] = 'running' self.experiments[id]['status'] = 'running'
self.experiments[id]['fileName'] = file_name self.experiments[id]['fileName'] = file_name
self.experiments[id]['platform'] = platform
self.write_file() self.write_file()
def update_experiment(self, id, key, value): def update_experiment(self, id, key, value):
......
...@@ -40,11 +40,12 @@ EXPERIMENT_SUCCESS_INFO = '\033[1;32;32mSuccessfully started experiment!\n\033[0 ...@@ -40,11 +40,12 @@ EXPERIMENT_SUCCESS_INFO = '\033[1;32;32mSuccessfully started experiment!\n\033[0
' commands description\n' \ ' commands description\n' \
'1. nnictl experiment show show the information of experiments\n' \ '1. nnictl experiment show show the information of experiments\n' \
'2. nnictl trial ls list all of trial jobs\n' \ '2. nnictl trial ls list all of trial jobs\n' \
'3. nnictl log stderr show stderr log content\n' \ '3. nnictl top monitor the status of running experiments\n' \
'4. nnictl log stdout show stdout log content\n' \ '4. nnictl log stderr show stderr log content\n' \
'5. nnictl stop stop an experiment\n' \ '5. nnictl log stdout show stdout log content\n' \
'6. nnictl trial kill kill a trial job by id\n' \ '6. nnictl stop stop an experiment\n' \
'7. nnictl --help get help information about nnictl\n' \ '7. nnictl trial kill kill a trial job by id\n' \
'8. nnictl --help get help information about nnictl\n' \
'-----------------------------------------------------------------------\n' \ '-----------------------------------------------------------------------\n' \
LOG_HEADER = '-----------------------------------------------------------------------\n' \ LOG_HEADER = '-----------------------------------------------------------------------\n' \
...@@ -54,12 +55,23 @@ LOG_HEADER = '------------------------------------------------------------------ ...@@ -54,12 +55,23 @@ LOG_HEADER = '------------------------------------------------------------------
EXPERIMENT_START_FAILED_INFO = 'There is an experiment running in the port %d, please stop it first or set another port!\n' \ EXPERIMENT_START_FAILED_INFO = 'There is an experiment running in the port %d, please stop it first or set another port!\n' \
'You could use \'nnictl stop --port [PORT]\' command to stop an experiment!\nOr you could use \'nnictl create --config [CONFIG_PATH] --port [PORT]\' to set port!\n' 'You could use \'nnictl stop --port [PORT]\' command to stop an experiment!\nOr you could use \'nnictl create --config [CONFIG_PATH] --port [PORT]\' to set port!\n'
EXPERIMENT_INFORMATION_FORMAT = '-----------------------------------------------------------------------\n' \ EXPERIMENT_INFORMATION_FORMAT = '----------------------------------------------------------------------------------------\n' \
' Experiment information\n' \ ' Experiment information\n' \
'%s\n' \ '%s\n' \
'-----------------------------------------------------------------------\n' '----------------------------------------------------------------------------------------\n'
EXPERIMENT_DETAIL_FORMAT = 'Id: %s Status: %s Port: %s StartTime: %s EndTime: %s \n' EXPERIMENT_DETAIL_FORMAT = 'Id: %s Status: %s Port: %s Platform: %s StartTime: %s EndTime: %s \n'
# Two-line summary template for one experiment in the monitor view:
# id, status, port, platform on the first line; start time and duration on the second.
EXPERIMENT_MONITOR_INFO = (
    'Id: %s Status: %s Port: %s Platform: %s \n'
    'StartTime: %s Duration: %s'
)

# Row template for one trial job: four left-aligned, fixed-width columns.
TRIAL_MONITOR_CONTENT = '%-15s %-25s %-25s %-15s'

# Table header: a rule, the column titles laid out with the row template, and a closing rule.
TRIAL_MONITOR_HEAD = '\n'.join((
    '-------------------------------------------------------------------------------------',
    TRIAL_MONITOR_CONTENT % ('trialId', 'startTime', 'endTime', 'status') + ' ',
    '-------------------------------------------------------------------------------------',
))

# Table footer: a rule followed by three newlines to separate consecutive experiments.
TRIAL_MONITOR_TAIL = '-------------------------------------------------------------------------------------' + '\n' * 3
PACKAGE_REQUIREMENTS = { PACKAGE_REQUIREMENTS = {
'SMAC': 'smac_tuner' 'SMAC': 'smac_tuner'
......
...@@ -366,8 +366,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen ...@@ -366,8 +366,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
nni_config.set_config('webuiUrl', web_ui_url_list) nni_config.set_config('webuiUrl', web_ui_url_list)
#save experiment information #save experiment information
experiment_config = Experiments() nnictl_experiment_config = Experiments()
experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name) nnictl_experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name, experiment_config['trainingServicePlatform'])
print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list))) print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list)))
......
...@@ -161,6 +161,12 @@ def parse_args(): ...@@ -161,6 +161,12 @@ def parse_args():
parser_tensorboard_start.add_argument('id', nargs='?', help='the id of experiment') parser_tensorboard_start.add_argument('id', nargs='?', help='the id of experiment')
parser_tensorboard_start.set_defaults(func=stop_tensorboard) parser_tensorboard_start.set_defaults(func=stop_tensorboard)
#parse top command
parser_top = subparsers.add_parser('top', help='monitor the experiment')
parser_top.add_argument('--time', '-t', dest='time', type=int, default=3, help='the time interval to update the experiment status, ' \
'the unit is second')
parser_top.set_defaults(func=monitor_experiment)
args = parser.parse_args() args = parser.parse_args()
args.func(args) args.func(args)
......
...@@ -27,8 +27,8 @@ from subprocess import call, check_output ...@@ -27,8 +27,8 @@ from subprocess import call, check_output
from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response
from .config_utils import Config, Experiments from .config_utils import Config, Experiments
from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url
from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \
import time EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL
from .common_utils import print_normal, print_error, print_warning, detect_process from .common_utils import print_normal, print_error, print_warning, detect_process
def update_experiment_status(): def update_experiment_status():
...@@ -68,7 +68,7 @@ def check_experiment_id(args): ...@@ -68,7 +68,7 @@ def check_experiment_id(args):
experiment_information = "" experiment_information = ""
for key in running_experiment_list: for key in running_experiment_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
experiment_dict[key]['port'], experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
exit(1) exit(1)
elif not running_experiment_list: elif not running_experiment_list:
...@@ -112,7 +112,7 @@ def parse_ids(args): ...@@ -112,7 +112,7 @@ def parse_ids(args):
experiment_information = "" experiment_information = ""
for key in running_experiment_list: for key in running_experiment_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
experiment_dict[key]['port'], experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
exit(1) exit(1)
else: else:
...@@ -367,6 +367,69 @@ def experiment_list(args): ...@@ -367,6 +367,69 @@ def experiment_list(args):
experiment_information = "" experiment_information = ""
for key in experiment_id_list: for key in experiment_id_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\ experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
def get_time_interval(time1, time2):
    '''Return the elapsed time from time1 to time2 as a 'Dd Hh Mm Ss' string.

    Both arguments are expected to be strings in '%Y-%m-%d %H:%M:%S' format.
    The whole span is measured, so intervals longer than one day report a
    non-zero day count. If time2 is earlier than time1, the interval is
    reported as zero. Returns 'N/A' when either value cannot be parsed.
    '''
    time_format = '%Y-%m-%d %H:%M:%S'
    try:
        start = datetime.datetime.strptime(time1, time_format)
        end = datetime.datetime.strptime(time2, time_format)
    except (TypeError, ValueError):
        # the value is not a string, or is not in the expected format
        return 'N/A'
    # total_seconds() covers the full span; timedelta.seconds alone drops whole days
    seconds = max(0, int((end - start).total_seconds()))
    #convert seconds to day:hour:minute:second
    days, seconds = divmod(seconds, 86400)
    hours, seconds = divmod(seconds, 3600)
    minutes, seconds = divmod(seconds, 60)
    return '%dd %dh %dm %ds' % (days, hours, minutes, seconds)
def show_experiment_info():
    '''Print the monitor view: one summary plus trial table per running experiment.

    Exits with status 1 when no experiments are recorded at all. Prints a
    warning and returns when experiments are recorded but none of them has
    the status 'running'.
    '''
    all_experiments = Experiments().get_all_experiments()
    if not all_experiments:
        print('There is no experiment running...')
        exit(1)
    running_ids = [exp_id for exp_id in all_experiments.keys()
                   if all_experiments[exp_id]['status'] == 'running']
    if not running_ids:
        print_warning('There is no experiment running...')
        return
    for exp_id in running_ids:
        record = all_experiments[exp_id]
        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # duration is measured from the recorded start time up to the current local time
        summary = (exp_id, record['status'], record['port'], record.get('platform'),
                   record['startTime'], get_time_interval(record['startTime'], now))
        print(EXPERIMENT_MONITOR_INFO % summary)
        print(TRIAL_MONITOR_HEAD)
        trials = []
        running, response = check_rest_server_quick(record['port'])
        if running:
            # trial jobs are only requested when the quick check reports the server as running
            response = rest_get(trial_jobs_url(record['port']), 20)
            if response and check_response(response):
                trials = json.loads(response.text)
        for position, raw_trial in enumerate(trials):
            trials[position] = convert_time_stamp_to_date(raw_trial)
            trial = trials[position]
            print(TRIAL_MONITOR_CONTENT % (trial.get('id'), trial.get('startTime'),
                                           trial.get('endTime'), trial.get('status')))
        print(TRIAL_MONITOR_TAIL)
def monitor_experiment(args):
    '''Repeatedly refresh and redisplay the experiment monitor until interrupted.

    args.time is the refresh interval in seconds and must be positive;
    otherwise an error is printed and the process exits with status 1.
    Each cycle clears the terminal, calls update_experiment_status() and
    show_experiment_info(), then sleeps for the interval. A keyboard
    interrupt exits with status 0; any other exception is printed and the
    process exits with status 1.
    '''
    if args.time <= 0:
        print_error('please input a positive integer as time interval, the unit is second.')
        raise SystemExit(1)
    # 'clear' is not a Windows command; the equivalent there is 'cls'
    clear_command = 'cls' if os.name == 'nt' else 'clear'
    while True:
        try:
            os.system(clear_command)
            update_experiment_status()
            show_experiment_info()
            time.sleep(args.time)
        except KeyboardInterrupt:
            # raise SystemExit directly: the exit() helper is only injected by the site module
            raise SystemExit(0)
        except Exception as exception:
            print_error(exception)
            raise SystemExit(1)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment