Unverified commit d6bfe2a9, authored by SparkSnail, committed by GitHub
Browse files

Refactor nnictl and add config_pai.yml (#144)

* fix nnictl bug

* add hdfs host validation

* fix bugs

* fix dockerfile

* fix install.sh

* update install.sh

* fix dockerfile

* Set timeout for HDFSUtility exists function

* remove unused TODO

* fix sdk

* add optional for outputDir and dataDir

* refactor dockerfile.base

* Remove unused import in hdfsclientUtility

* add config_pai.yml

* refactor nnictl create logic and add colorful print

* fix nnictl stop logic

* add annotation for config_pai.yml

* add documentation for starting an experiment

* fix config.yml

* fix document
parent 5c042627
...@@ -30,13 +30,13 @@ from .launcher_utils import validate_all_content ...@@ -30,13 +30,13 @@ from .launcher_utils import validate_all_content
from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response
from .url_utils import cluster_metadata_url, experiment_url from .url_utils import cluster_metadata_url, experiment_url
from .config_utils import Config from .config_utils import Config
from .common_utils import get_yml_content, get_json_content, print_error, print_normal, detect_process from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process
from .constants import EXPERIMENT_SUCCESS_INFO, STDOUT_FULL_PATH, STDERR_FULL_PATH, LOG_DIR, REST_PORT, ERROR_INFO, NORMAL_INFO from .constants import *
from .webui_utils import start_web_ui, check_web_ui from .webui_utils import start_web_ui, check_web_ui
def start_rest_server(port, platform, mode, experiment_id=None): def start_rest_server(port, platform, mode, experiment_id=None):
'''Run nni manager process''' '''Run nni manager process'''
print_normal('Checking experiment...') print_normal('Checking environment...')
nni_config = Config() nni_config = Config()
rest_port = nni_config.get_config('restServerPort') rest_port = nni_config.get_config('restServerPort')
running, _ = check_rest_server_quick(rest_port) running, _ = check_rest_server_quick(rest_port)
...@@ -206,10 +206,9 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No ...@@ -206,10 +206,9 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No
experiment_config['searchSpace'] = json.dumps('') experiment_config['searchSpace'] = json.dumps('')
# check rest server # check rest server
print_normal('Checking restful server...')
running, _ = check_rest_server(REST_PORT) running, _ = check_rest_server(REST_PORT)
if running: if running:
print_normal('Restful server start success!') print_normal('Successfully started Restful server!')
else: else:
print_error('Restful server start failed!') print_error('Restful server start failed!')
try: try:
...@@ -238,7 +237,7 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No ...@@ -238,7 +237,7 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No
if experiment_config['trainingServicePlatform'] == 'local': if experiment_config['trainingServicePlatform'] == 'local':
print_normal('Setting local config...') print_normal('Setting local config...')
if set_local_config(experiment_config, REST_PORT): if set_local_config(experiment_config, REST_PORT):
print_normal('Success!') print_normal('Successfully set local config!')
else: else:
print_error('Failed!') print_error('Failed!')
try: try:
...@@ -253,7 +252,7 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No ...@@ -253,7 +252,7 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No
print_normal('Setting pai config...') print_normal('Setting pai config...')
config_result, err_msg = set_pai_config(experiment_config, REST_PORT) config_result, err_msg = set_pai_config(experiment_config, REST_PORT)
if config_result: if config_result:
print_normal('Success!') print_normal('Successfully set pai config!')
else: else:
if err_msg: if err_msg:
print_error('Failed! Error is: {}'.format(err_msg)) print_error('Failed! Error is: {}'.format(err_msg))
...@@ -261,8 +260,19 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No ...@@ -261,8 +260,19 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No
cmds = ['pkill', '-P', str(rest_process.pid)] cmds = ['pkill', '-P', str(rest_process.pid)]
call(cmds) call(cmds)
except Exception: except Exception:
raise Exception(ERROR_INFO % 'Rest server stopped!') raise Exception(ERROR_INFO % 'Restful server stopped!')
exit(0) exit(0)
#start webui
if check_web_ui():
print_warning('{0} {1}'.format(' '.join(nni_config.get_config('webuiUrl')),'is being used, please stop it first!'))
print_normal('You can use \'nnictl webui stop\' to stop old Web UI process...')
else:
print_normal('Starting Web UI...')
webui_process = start_web_ui(webuiport)
if webui_process:
nni_config.set_config('webuiPid', webui_process.pid)
print_normal('Successfully started Web UI!')
# start a new experiment # start a new experiment
print_normal('Starting experiment...') print_normal('Starting experiment...')
...@@ -276,25 +286,12 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No ...@@ -276,25 +286,12 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No
try: try:
cmds = ['pkill', '-P', str(rest_process.pid)] cmds = ['pkill', '-P', str(rest_process.pid)]
call(cmds) call(cmds)
cmds = ['pkill', '-P', str(webui_process.pid)]
call(cmds)
except Exception: except Exception:
raise Exception(ERROR_INFO % 'Rest server stopped!') raise Exception(ERROR_INFO % 'Restful server stopped!')
exit(0) exit(0)
print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, REST_PORT, ' '.join(nni_config.get_config('webuiUrl'))))
#start webui
print_normal('Checking web ui...')
if check_web_ui():
print_error('{0} {1}'.format(' '.join(nni_config.get_config('webuiUrl')),'is being used, please stop it first!'))
print_normal('You can use \'nnictl webui stop\' to stop old web ui process...')
else:
print_normal('Starting web ui...')
webui_process = start_web_ui(webuiport)
if webui_process:
nni_config.set_config('webuiPid', webui_process.pid)
print_normal('Starting web ui success!')
print_normal('{0} {1}'.format('Web UI url:', ' '.join(nni_config.get_config('webuiUrl'))))
print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, REST_PORT))
def resume_experiment(args): def resume_experiment(args):
'''resume an experiment''' '''resume an experiment'''
......
...@@ -64,17 +64,20 @@ def stop_experiment(args): ...@@ -64,17 +64,20 @@ def stop_experiment(args):
stop_web_ui() stop_web_ui()
return return
running, _ = check_rest_server_quick(rest_port) running, _ = check_rest_server_quick(rest_port)
stop_rest_result = True
if running: if running:
response = rest_delete(experiment_url(rest_port), 20) response = rest_delete(experiment_url(rest_port), 20)
if not response or not check_response(response): if not response or not check_response(response):
print_error('Stop experiment failed!') print_error('Stop experiment failed!')
stop_rest_result = False
#sleep to wait rest handler done #sleep to wait rest handler done
time.sleep(3) time.sleep(3)
rest_pid = nni_config.get_config('restServerPid') rest_pid = nni_config.get_config('restServerPid')
cmds = ['pkill', '-P', str(rest_pid)] cmds = ['pkill', '-P', str(rest_pid)]
call(cmds) call(cmds)
stop_web_ui() stop_web_ui()
print_normal('Stop experiment success!') if stop_rest_result:
print_normal('Stop experiment success!')
def trial_ls(args): def trial_ls(args):
'''List trial''' '''List trial'''
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment