Unverified Commit bf2b9290 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Support monitor mode when creating or resuming a new experiment (#1933)

parent 4ed78edd
......@@ -49,6 +49,7 @@ nnictl support commands:
|--config, -c| True| |YAML configure file of the experiment|
|--port, -p|False| |the port of restful server|
|--debug, -d|False||set debug mode|
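|--watch, -w|False||set watch mode: after the experiment is created, nnictl stays in the foreground and keeps refreshing the experiment information; the experiment is stopped when its status becomes DONE, ERROR or STOPPED, or when nnictl is interrupted with Ctrl+C|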
* Examples
......@@ -97,6 +98,7 @@ Debug mode will disable version check function in Trialkeeper.
|id| True| |The id of the experiment you want to resume|
|--port, -p| False| |Rest port of the experiment you want to resume|
|--debug, -d|False||set debug mode|
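|--watch, -w|False||set watch mode: after the experiment is resumed, nnictl stays in the foreground and keeps refreshing the experiment information; the experiment is stopped when its status becomes DONE, ERROR or STOPPED, or when nnictl is interrupted with Ctrl+C|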
* Example
......
# list of commands/arguments
__nnictl_cmds="create resume view update stop trial experiment platform import export webui config log package tensorboard top"
__nnictl_create_cmds="--config --port --debug"
__nnictl_resume_cmds="--port --debug"
__nnictl_create_cmds="--config --port --debug --watch"
__nnictl_resume_cmds="--port --debug --watch"
__nnictl_view_cmds="--port"
__nnictl_update_cmds="searchspace concurrency duration trialnum"
__nnictl_update_searchspace_cmds="--filename"
......
......@@ -20,7 +20,7 @@ from .common_utils import get_yml_content, get_json_content, print_error, print_
detect_port, get_user, get_python_dir
from .constants import NNICTL_HOME_DIR, ERROR_INFO, REST_TIME_OUT, EXPERIMENT_SUCCESS_INFO, LOG_HEADER, PACKAGE_REQUIREMENTS
from .command_utils import check_output_command, kill_command
from .nnictl_utils import update_experiment
from .nnictl_utils import update_experiment, set_monitor
def get_log_path(config_file_name):
'''generate stdout and stderr log path'''
......@@ -493,6 +493,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
experiment_config['experimentName'])
print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list)))
if args.watch:
set_monitor(True, 3, args.port, rest_process.pid)
def create_experiment(args):
'''start a new experiment'''
......@@ -506,8 +508,8 @@ def create_experiment(args):
validate_all_content(experiment_config, config_path)
nni_config.set_config('experimentConfig', experiment_config)
launch_experiment(args, experiment_config, 'new', config_file_name)
nni_config.set_config('restServerPort', args.port)
launch_experiment(args, experiment_config, 'new', config_file_name)
def manage_stopped_experiment(args, mode):
'''view a stopped experiment'''
......
......@@ -51,6 +51,7 @@ def parse_args():
parser_start.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file')
parser_start.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_start.add_argument('--debug', '-d', action='store_true', help=' set debug mode')
parser_start.add_argument('--watch', '-w', action='store_true', help=' set watch mode')
parser_start.set_defaults(func=create_experiment)
# parse resume command
......@@ -58,6 +59,7 @@ def parse_args():
parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to resume')
parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_resume.add_argument('--debug', '-d', action='store_true', help=' set debug mode')
parser_resume.add_argument('--watch', '-w', action='store_true', help=' set watch mode')
parser_resume.set_defaults(func=resume_experiment)
# parse view command
......
......@@ -3,6 +3,7 @@
import csv
import os
import sys
import json
import time
import re
......@@ -623,23 +624,44 @@ def show_experiment_info():
content[index].get('endTime'), content[index].get('status')))
print(TRIAL_MONITOR_TAIL)
def set_monitor(auto_exit, time_interval, port=None, pid=None):
    '''Run the experiment monitor loop until it is interrupted or finished.

    On every iteration the terminal is cleared, the experiment records are
    refreshed with update_experiment() and printed with
    show_experiment_info(), and the loop then sleeps for time_interval
    seconds.

    Parameters
    ----------
    auto_exit:
        When true, the experiment status is polled through
        get_experiment_status(port) on every iteration. Once it is DONE,
        ERROR or STOPPED, kill_command(pid) is called and the process exits
        with code 0. Ctrl+C triggers the same kill_command(pid) call.
        When false, Ctrl+C only leaves the monitor.
    time_interval:
        Seconds to sleep between two refreshes.
    port:
        Passed to get_experiment_status(); only used when auto_exit is true.
    pid:
        Passed to kill_command(); only used when auto_exit is true.
        NOTE(review): the visible caller passes the REST server process id;
        confirm that this is what kill_command expects.

    This function never returns normally: every exit path calls exit().
    '''
    while True:
        try:
            # 'clear' does not exist in the Windows shell, so pick the
            # command that matches the current platform.
            if sys.platform == 'win32':
                os.system('cls')
            else:
                os.system('clear')
            update_experiment()
            show_experiment_info()
            if auto_exit:
                status = get_experiment_status(port)
                if status in ['DONE', 'ERROR', 'STOPPED']:
                    print_normal('Experiment status is {0}.'.format(status))
                    print_normal('Stopping experiment...')
                    kill_command(pid)
                    print_normal('Stop experiment success.')
                    # exit() raises SystemExit, which the
                    # "except Exception" handler below does not catch.
                    exit(0)
            time.sleep(time_interval)
        except KeyboardInterrupt:
            if auto_exit:
                print_normal('Stopping experiment...')
                kill_command(pid)
                print_normal('Stop experiment success.')
            else:
                print_normal('Exiting...')
            exit(0)
        except Exception as exception:
            # Any other failure ends the monitor with a non-zero exit code.
            print_error(exception)
            exit(1)


def monitor_experiment(args):
    '''Monitor the experiment, refreshing every args.time seconds.

    Exits with code 1 when args.time is not positive; otherwise hands
    control to set_monitor() with auto_exit disabled, so Ctrl+C only
    leaves the monitor.
    '''
    if args.time <= 0:
        print_error('please input a positive integer as time interval, the unit is second.')
        exit(1)
    set_monitor(False, args.time)
def export_trials_data(args):
'''export experiment metadata to csv
'''
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment