Unverified Commit afce6d4a authored by fishyds's avatar fishyds Committed by GitHub
Browse files

Merge pull request #950 from Microsoft/v0.6

Merge V0.6 branch to master
parents 6545540d 29a23335
...@@ -192,17 +192,20 @@ class SMACTuner(Tuner): ...@@ -192,17 +192,20 @@ class SMACTuner(Tuner):
Returns Returns
------- -------
dict dict
challenger dict dict which stores copy of challengers
""" """
converted_dict = {}
for key, value in challenger_dict.items(): for key, value in challenger_dict.items():
# convert to loguniform # convert to loguniform
if key in self.loguniform_key: if key in self.loguniform_key:
challenger_dict[key] = np.exp(challenger_dict[key]) converted_dict[key] = np.exp(challenger_dict[key])
# convert categorical back to original value # convert categorical back to original value
if key in self.categorical_dict: elif key in self.categorical_dict:
idx = challenger_dict[key] idx = challenger_dict[key]
challenger_dict[key] = self.categorical_dict[key][idx] converted_dict[key] = self.categorical_dict[key][idx]
return challenger_dict else:
converted_dict[key] = value
return converted_dict
def generate_parameters(self, parameter_id): def generate_parameters(self, parameter_id):
"""generate one instance of hyperparameters """generate one instance of hyperparameters
...@@ -220,13 +223,11 @@ class SMACTuner(Tuner): ...@@ -220,13 +223,11 @@ class SMACTuner(Tuner):
if self.first_one: if self.first_one:
init_challenger = self.smbo_solver.nni_smac_start() init_challenger = self.smbo_solver.nni_smac_start()
self.total_data[parameter_id] = init_challenger self.total_data[parameter_id] = init_challenger
json_tricks.dumps(init_challenger.get_dictionary())
return self.convert_loguniform_categorical(init_challenger.get_dictionary()) return self.convert_loguniform_categorical(init_challenger.get_dictionary())
else: else:
challengers = self.smbo_solver.nni_smac_request_challengers() challengers = self.smbo_solver.nni_smac_request_challengers()
for challenger in challengers: for challenger in challengers:
self.total_data[parameter_id] = challenger self.total_data[parameter_id] = challenger
json_tricks.dumps(challenger.get_dictionary())
return self.convert_loguniform_categorical(challenger.get_dictionary()) return self.convert_loguniform_categorical(challenger.get_dictionary())
def generate_multiple_parameters(self, parameter_id_list): def generate_multiple_parameters(self, parameter_id_list):
...@@ -247,7 +248,6 @@ class SMACTuner(Tuner): ...@@ -247,7 +248,6 @@ class SMACTuner(Tuner):
for one_id in parameter_id_list: for one_id in parameter_id_list:
init_challenger = self.smbo_solver.nni_smac_start() init_challenger = self.smbo_solver.nni_smac_start()
self.total_data[one_id] = init_challenger self.total_data[one_id] = init_challenger
json_tricks.dumps(init_challenger.get_dictionary())
params.append(self.convert_loguniform_categorical(init_challenger.get_dictionary())) params.append(self.convert_loguniform_categorical(init_challenger.get_dictionary()))
else: else:
challengers = self.smbo_solver.nni_smac_request_challengers() challengers = self.smbo_solver.nni_smac_request_challengers()
...@@ -257,7 +257,6 @@ class SMACTuner(Tuner): ...@@ -257,7 +257,6 @@ class SMACTuner(Tuner):
if cnt >= len(parameter_id_list): if cnt >= len(parameter_id_list):
break break
self.total_data[parameter_id_list[cnt]] = challenger self.total_data[parameter_id_list[cnt]] = challenger
json_tricks.dumps(challenger.get_dictionary())
params.append(self.convert_loguniform_categorical(challenger.get_dictionary())) params.append(self.convert_loguniform_categorical(challenger.get_dictionary()))
cnt += 1 cnt += 1
return params return params
...@@ -20,6 +20,7 @@ require('../static/style/overviewTitle.scss'); ...@@ -20,6 +20,7 @@ require('../static/style/overviewTitle.scss');
interface OverviewState { interface OverviewState {
tableData: Array<TableObj>; tableData: Array<TableObj>;
experimentAPI: object;
searchSpace: object; searchSpace: object;
status: string; status: string;
errorStr: string; errorStr: string;
...@@ -47,6 +48,7 @@ class Overview extends React.Component<{}, OverviewState> { ...@@ -47,6 +48,7 @@ class Overview extends React.Component<{}, OverviewState> {
super(props); super(props);
this.state = { this.state = {
searchSpace: {}, searchSpace: {},
experimentAPI: {},
status: '', status: '',
errorStr: '', errorStr: '',
trialProfile: { trialProfile: {
...@@ -143,6 +145,7 @@ class Overview extends React.Component<{}, OverviewState> { ...@@ -143,6 +145,7 @@ class Overview extends React.Component<{}, OverviewState> {
}); });
if (this._isMounted) { if (this._isMounted) {
this.setState({ this.setState({
experimentAPI: res.data,
trialProfile: trialPro[0], trialProfile: trialPro[0],
searchSpace: searchSpace, searchSpace: searchSpace,
isLogCollection: expLogCollection isLogCollection: expLogCollection
...@@ -390,7 +393,7 @@ class Overview extends React.Component<{}, OverviewState> { ...@@ -390,7 +393,7 @@ class Overview extends React.Component<{}, OverviewState> {
const { const {
trialProfile, searchSpace, tableData, accuracyData, trialProfile, searchSpace, tableData, accuracyData,
accNodata, status, errorStr, trialNumber, bestAccuracy, accNodata, status, errorStr, trialNumber, bestAccuracy,
titleMaxbgcolor, titleMinbgcolor, isLogCollection titleMaxbgcolor, titleMinbgcolor, isLogCollection, experimentAPI
} = this.state; } = this.state;
return ( return (
...@@ -425,9 +428,7 @@ class Overview extends React.Component<{}, OverviewState> { ...@@ -425,9 +428,7 @@ class Overview extends React.Component<{}, OverviewState> {
<Row className="experiment"> <Row className="experiment">
{/* the scroll bar all the trial profile in the searchSpace div*/} {/* the scroll bar all the trial profile in the searchSpace div*/}
<div className="experiment searchSpace"> <div className="experiment searchSpace">
<TrialPro <TrialPro experiment={experimentAPI} />
tiralProInfo={trialProfile}
/>
</div> </div>
</Row> </Row>
</Col> </Col>
......
import * as React from 'react'; import * as React from 'react';
import { Experiment } from '../../static/interface';
import MonacoEditor from 'react-monaco-editor'; import MonacoEditor from 'react-monaco-editor';
import { MONACO } from '../../static/const'; import { MONACO } from '../../static/const';
interface TrialInfoProps { interface TrialInfoProps {
tiralProInfo: Experiment; experiment: object;
} }
class TrialInfo extends React.Component<TrialInfoProps, {}> { class TrialInfo extends React.Component<TrialInfoProps, {}> {
...@@ -13,19 +12,32 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> { ...@@ -13,19 +12,32 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
super(props); super(props);
} }
render() { componentWillReceiveProps(nextProps: TrialInfoProps) {
const { tiralProInfo } = this.props; const experiments = nextProps.experiment;
const showProInfo = []; Object.keys(experiments).map(key => {
showProInfo.push({ switch (key) {
revision: tiralProInfo.revision, case 'id':
authorName: tiralProInfo.author, case 'logDir':
trialConcurrency: tiralProInfo.runConcurren, case 'startTime':
tuner: tiralProInfo.tuner, case 'endTime':
assessor: tiralProInfo.assessor ? tiralProInfo.assessor : undefined, experiments[key] = undefined;
logCollection: tiralProInfo.logCollection ? tiralProInfo.logCollection : undefined, break;
advisor: tiralProInfo.advisor ? tiralProInfo.advisor : undefined, case 'params':
clusterMetaData: tiralProInfo.clusterMetaData ? tiralProInfo.clusterMetaData : undefined const params = experiments[key];
Object.keys(params).map(item => {
if (item === 'experimentName' || item === 'searchSpace'
|| item === 'trainingServicePlatform') {
params[item] = undefined;
}
});
break;
default:
}
}); });
}
render() {
const { experiment } = this.props;
return ( return (
<div className="profile"> <div className="profile">
<MonacoEditor <MonacoEditor
...@@ -33,7 +45,7 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> { ...@@ -33,7 +45,7 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
height="380" height="380"
language="json" language="json"
theme="vs-light" theme="vs-light"
value={JSON.stringify(showProInfo[0], null, 2)} value={JSON.stringify(experiment, null, 2)}
options={MONACO} options={MONACO}
/> />
</div> </div>
...@@ -41,4 +53,4 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> { ...@@ -41,4 +53,4 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
} }
} }
export default TrialInfo; export default TrialInfo;
\ No newline at end of file
...@@ -79,7 +79,7 @@ class Experiments: ...@@ -79,7 +79,7 @@ class Experiments:
self.experiments[id]['port'] = port self.experiments[id]['port'] = port
self.experiments[id]['startTime'] = time self.experiments[id]['startTime'] = time
self.experiments[id]['endTime'] = 'N/A' self.experiments[id]['endTime'] = 'N/A'
self.experiments[id]['status'] = 'running' self.experiments[id]['status'] = 'INITIALIZED'
self.experiments[id]['fileName'] = file_name self.experiments[id]['fileName'] = file_name
self.experiments[id]['platform'] = platform self.experiments[id]['platform'] = platform
self.write_file() self.write_file()
......
...@@ -30,6 +30,8 @@ WARNING_INFO = 'WARNING: %s' ...@@ -30,6 +30,8 @@ WARNING_INFO = 'WARNING: %s'
DEFAULT_REST_PORT = 8080 DEFAULT_REST_PORT = 8080
REST_TIME_OUT = 20
EXPERIMENT_SUCCESS_INFO = '\033[1;32;32mSuccessfully started experiment!\n\033[0m' \ EXPERIMENT_SUCCESS_INFO = '\033[1;32;32mSuccessfully started experiment!\n\033[0m' \
'-----------------------------------------------------------------------\n' \ '-----------------------------------------------------------------------\n' \
'The experiment id is %s\n'\ 'The experiment id is %s\n'\
......
...@@ -139,7 +139,7 @@ def set_trial_config(experiment_config, port, config_file_name): ...@@ -139,7 +139,7 @@ def set_trial_config(experiment_config, port, config_file_name):
'''set trial configuration''' '''set trial configuration'''
request_data = dict() request_data = dict()
request_data['trial_config'] = experiment_config['trial'] request_data['trial_config'] = experiment_config['trial']
response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20) response = rest_put(cluster_metadata_url(port), json.dumps(request_data), REST_TIME_OUT)
if check_response(response): if check_response(response):
return True return True
else: else:
...@@ -159,7 +159,7 @@ def set_remote_config(experiment_config, port, config_file_name): ...@@ -159,7 +159,7 @@ def set_remote_config(experiment_config, port, config_file_name):
#set machine_list #set machine_list
request_data = dict() request_data = dict()
request_data['machine_list'] = experiment_config['machineList'] request_data['machine_list'] = experiment_config['machineList']
response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20) response = rest_put(cluster_metadata_url(port), json.dumps(request_data), REST_TIME_OUT)
err_message = '' err_message = ''
if not response or not check_response(response): if not response or not check_response(response):
if response is not None: if response is not None:
...@@ -180,7 +180,7 @@ def setNNIManagerIp(experiment_config, port, config_file_name): ...@@ -180,7 +180,7 @@ def setNNIManagerIp(experiment_config, port, config_file_name):
return True, None return True, None
ip_config_dict = dict() ip_config_dict = dict()
ip_config_dict['nni_manager_ip'] = { 'nniManagerIp' : experiment_config['nniManagerIp'] } ip_config_dict['nni_manager_ip'] = { 'nniManagerIp' : experiment_config['nniManagerIp'] }
response = rest_put(cluster_metadata_url(port), json.dumps(ip_config_dict), 20) response = rest_put(cluster_metadata_url(port), json.dumps(ip_config_dict), REST_TIME_OUT)
err_message = None err_message = None
if not response or not response.status_code == 200: if not response or not response.status_code == 200:
if response is not None: if response is not None:
...@@ -195,7 +195,7 @@ def set_pai_config(experiment_config, port, config_file_name): ...@@ -195,7 +195,7 @@ def set_pai_config(experiment_config, port, config_file_name):
'''set pai configuration''' '''set pai configuration'''
pai_config_data = dict() pai_config_data = dict()
pai_config_data['pai_config'] = experiment_config['paiConfig'] pai_config_data['pai_config'] = experiment_config['paiConfig']
response = rest_put(cluster_metadata_url(port), json.dumps(pai_config_data), 20) response = rest_put(cluster_metadata_url(port), json.dumps(pai_config_data), REST_TIME_OUT)
err_message = None err_message = None
if not response or not response.status_code == 200: if not response or not response.status_code == 200:
if response is not None: if response is not None:
...@@ -214,7 +214,7 @@ def set_kubeflow_config(experiment_config, port, config_file_name): ...@@ -214,7 +214,7 @@ def set_kubeflow_config(experiment_config, port, config_file_name):
'''set kubeflow configuration''' '''set kubeflow configuration'''
kubeflow_config_data = dict() kubeflow_config_data = dict()
kubeflow_config_data['kubeflow_config'] = experiment_config['kubeflowConfig'] kubeflow_config_data['kubeflow_config'] = experiment_config['kubeflowConfig']
response = rest_put(cluster_metadata_url(port), json.dumps(kubeflow_config_data), 20) response = rest_put(cluster_metadata_url(port), json.dumps(kubeflow_config_data), REST_TIME_OUT)
err_message = None err_message = None
if not response or not response.status_code == 200: if not response or not response.status_code == 200:
if response is not None: if response is not None:
...@@ -233,7 +233,7 @@ def set_frameworkcontroller_config(experiment_config, port, config_file_name): ...@@ -233,7 +233,7 @@ def set_frameworkcontroller_config(experiment_config, port, config_file_name):
'''set kubeflow configuration''' '''set kubeflow configuration'''
frameworkcontroller_config_data = dict() frameworkcontroller_config_data = dict()
frameworkcontroller_config_data['frameworkcontroller_config'] = experiment_config['frameworkcontrollerConfig'] frameworkcontroller_config_data['frameworkcontroller_config'] = experiment_config['frameworkcontrollerConfig']
response = rest_put(cluster_metadata_url(port), json.dumps(frameworkcontroller_config_data), 20) response = rest_put(cluster_metadata_url(port), json.dumps(frameworkcontroller_config_data), REST_TIME_OUT)
err_message = None err_message = None
if not response or not response.status_code == 200: if not response or not response.status_code == 200:
if response is not None: if response is not None:
...@@ -304,7 +304,7 @@ def set_experiment(experiment_config, mode, port, config_file_name): ...@@ -304,7 +304,7 @@ def set_experiment(experiment_config, mode, port, config_file_name):
request_data['clusterMetaData'].append( request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']}) {'key': 'trial_config', 'value': experiment_config['trial']})
response = rest_post(experiment_url(port), json.dumps(request_data), 20) response = rest_post(experiment_url(port), json.dumps(request_data), REST_TIME_OUT)
if check_response(response): if check_response(response):
return response return response
else: else:
...@@ -488,7 +488,7 @@ def resume_experiment(args): ...@@ -488,7 +488,7 @@ def resume_experiment(args):
if experiment_dict.get(args.id) is None: if experiment_dict.get(args.id) is None:
print_error('Id %s not exist!' % args.id) print_error('Id %s not exist!' % args.id)
exit(1) exit(1)
if experiment_dict[args.id]['status'] == 'running': if experiment_dict[args.id]['status'] != 'STOPPED':
print_error('Experiment %s is running!' % args.id) print_error('Experiment %s is running!' % args.id)
exit(1) exit(1)
experiment_id = args.id experiment_id = args.id
......
...@@ -28,10 +28,25 @@ from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_re ...@@ -28,10 +28,25 @@ from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_re
from .config_utils import Config, Experiments from .config_utils import Config, Experiments
from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url
from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \ from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \
EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL, REST_TIME_OUT
from .common_utils import print_normal, print_error, print_warning, detect_process from .common_utils import print_normal, print_error, print_warning, detect_process
def update_experiment_status(): def get_experiment_time(port):
'''get the startTime and endTime of an experiment'''
response = rest_get(experiment_url(port), REST_TIME_OUT)
if response and check_response(response):
content = convert_time_stamp_to_date(json.loads(response.text))
return content.get('startTime'), content.get('endTime')
return None, None
def get_experiment_status(port):
'''get the status of an experiment'''
result, response = check_rest_server_quick(port)
if result:
return json.loads(response.text).get('status')
return None
def update_experiment():
'''Update the experiment status in config file''' '''Update the experiment status in config file'''
experiment_config = Experiments() experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments() experiment_dict = experiment_config.get_all_experiments()
...@@ -39,16 +54,26 @@ def update_experiment_status(): ...@@ -39,16 +54,26 @@ def update_experiment_status():
return None return None
for key in experiment_dict.keys(): for key in experiment_dict.keys():
if isinstance(experiment_dict[key], dict): if isinstance(experiment_dict[key], dict):
if experiment_dict[key].get('status') == 'running': if experiment_dict[key].get('status') != 'STOPPED':
nni_config = Config(experiment_dict[key]['fileName']) nni_config = Config(experiment_dict[key]['fileName'])
rest_pid = nni_config.get_config('restServerPid') rest_pid = nni_config.get_config('restServerPid')
if not detect_process(rest_pid): if not detect_process(rest_pid):
experiment_config.update_experiment(key, 'status', 'stopped') experiment_config.update_experiment(key, 'status', 'STOPPED')
continue
rest_port = nni_config.get_config('restServerPort')
startTime, endTime = get_experiment_time(rest_port)
if startTime:
experiment_config.update_experiment(key, 'startTime', startTime)
if endTime:
experiment_config.update_experiment(key, 'endTime', endTime)
status = get_experiment_status(rest_port)
if status:
experiment_config.update_experiment(key, 'status', status)
def check_experiment_id(args): def check_experiment_id(args):
'''check if the id is valid '''check if the id is valid
''' '''
update_experiment_status() update_experiment()
experiment_config = Experiments() experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments() experiment_dict = experiment_config.get_all_experiments()
if not experiment_dict: if not experiment_dict:
...@@ -58,13 +83,13 @@ def check_experiment_id(args): ...@@ -58,13 +83,13 @@ def check_experiment_id(args):
running_experiment_list = [] running_experiment_list = []
for key in experiment_dict.keys(): for key in experiment_dict.keys():
if isinstance(experiment_dict[key], dict): if isinstance(experiment_dict[key], dict):
if experiment_dict[key].get('status') == 'running': if experiment_dict[key].get('status') != 'STOPPED':
running_experiment_list.append(key) running_experiment_list.append(key)
elif isinstance(experiment_dict[key], list): elif isinstance(experiment_dict[key], list):
# if the config file is old version, remove the configuration from file # if the config file is old version, remove the configuration from file
experiment_config.remove_experiment(key) experiment_config.remove_experiment(key)
if len(running_experiment_list) > 1: if len(running_experiment_list) > 1:
print_error('There are multiple experiments running, please set the experiment id...') print_error('There are multiple experiments, please set the experiment id...')
experiment_information = "" experiment_information = ""
for key in running_experiment_list: for key in running_experiment_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
...@@ -94,7 +119,7 @@ def parse_ids(args): ...@@ -94,7 +119,7 @@ def parse_ids(args):
5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id 5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id
6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information 6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information
''' '''
update_experiment_status() update_experiment()
experiment_config = Experiments() experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments() experiment_dict = experiment_config.get_all_experiments()
if not experiment_dict: if not experiment_dict:
...@@ -104,14 +129,14 @@ def parse_ids(args): ...@@ -104,14 +129,14 @@ def parse_ids(args):
running_experiment_list = [] running_experiment_list = []
for key in experiment_dict.keys(): for key in experiment_dict.keys():
if isinstance(experiment_dict[key], dict): if isinstance(experiment_dict[key], dict):
if experiment_dict[key].get('status') == 'running': if experiment_dict[key].get('status') != 'STOPPED':
running_experiment_list.append(key) running_experiment_list.append(key)
elif isinstance(experiment_dict[key], list): elif isinstance(experiment_dict[key], list):
# if the config file is old version, remove the configuration from file # if the config file is old version, remove the configuration from file
experiment_config.remove_experiment(key) experiment_config.remove_experiment(key)
if not args.id: if not args.id:
if len(running_experiment_list) > 1: if len(running_experiment_list) > 1:
print_error('There are multiple experiments running, please set the experiment id...') print_error('There are multiple experiments, please set the experiment id...')
experiment_information = "" experiment_information = ""
for key in running_experiment_list: for key in running_experiment_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
...@@ -207,7 +232,7 @@ def stop_experiment(args): ...@@ -207,7 +232,7 @@ def stop_experiment(args):
print_error(exception) print_error(exception)
nni_config.set_config('tensorboardPidList', []) nni_config.set_config('tensorboardPidList', [])
print_normal('Stop experiment success!') print_normal('Stop experiment success!')
experiment_config.update_experiment(experiment_id, 'status', 'stopped') experiment_config.update_experiment(experiment_id, 'status', 'STOPPED')
time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
experiment_config.update_experiment(experiment_id, 'endTime', str(time_now)) experiment_config.update_experiment(experiment_id, 'endTime', str(time_now))
...@@ -221,7 +246,7 @@ def trial_ls(args): ...@@ -221,7 +246,7 @@ def trial_ls(args):
return return
running, response = check_rest_server_quick(rest_port) running, response = check_rest_server_quick(rest_port)
if running: if running:
response = rest_get(trial_jobs_url(rest_port), 20) response = rest_get(trial_jobs_url(rest_port), REST_TIME_OUT)
if response and check_response(response): if response and check_response(response):
content = json.loads(response.text) content = json.loads(response.text)
for index, value in enumerate(content): for index, value in enumerate(content):
...@@ -242,7 +267,7 @@ def trial_kill(args): ...@@ -242,7 +267,7 @@ def trial_kill(args):
return return
running, _ = check_rest_server_quick(rest_port) running, _ = check_rest_server_quick(rest_port)
if running: if running:
response = rest_delete(trial_job_id_url(rest_port, args.id), 20) response = rest_delete(trial_job_id_url(rest_port, args.id), REST_TIME_OUT)
if response and check_response(response): if response and check_response(response):
print(response.text) print(response.text)
else: else:
...@@ -260,7 +285,7 @@ def list_experiment(args): ...@@ -260,7 +285,7 @@ def list_experiment(args):
return return
running, _ = check_rest_server_quick(rest_port) running, _ = check_rest_server_quick(rest_port)
if running: if running:
response = rest_get(experiment_url(rest_port), 20) response = rest_get(experiment_url(rest_port), REST_TIME_OUT)
if response and check_response(response): if response and check_response(response):
content = convert_time_stamp_to_date(json.loads(response.text)) content = convert_time_stamp_to_date(json.loads(response.text))
print(json.dumps(content, indent=4, sort_keys=True, separators=(',', ':'))) print(json.dumps(content, indent=4, sort_keys=True, separators=(',', ':')))
...@@ -322,7 +347,7 @@ def log_trial(args): ...@@ -322,7 +347,7 @@ def log_trial(args):
return return
running, response = check_rest_server_quick(rest_port) running, response = check_rest_server_quick(rest_port)
if running: if running:
response = rest_get(trial_jobs_url(rest_port), 20) response = rest_get(trial_jobs_url(rest_port), REST_TIME_OUT)
if response and check_response(response): if response and check_response(response):
content = json.loads(response.text) content = json.loads(response.text)
for trial in content: for trial in content:
...@@ -362,18 +387,20 @@ def experiment_list(args): ...@@ -362,18 +387,20 @@ def experiment_list(args):
if not experiment_dict: if not experiment_dict:
print('There is no experiment running...') print('There is no experiment running...')
exit(1) exit(1)
update_experiment()
experiment_id_list = [] experiment_id_list = []
if args.all and args.all == 'all': if args.all and args.all == 'all':
for key in experiment_dict.keys(): for key in experiment_dict.keys():
experiment_id_list.append(key) experiment_id_list.append(key)
else: else:
for key in experiment_dict.keys(): for key in experiment_dict.keys():
if experiment_dict[key]['status'] == 'running': if experiment_dict[key]['status'] != 'STOPPED':
experiment_id_list.append(key) experiment_id_list.append(key)
if not experiment_id_list: if not experiment_id_list:
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list all\' to list all stopped experiments!') print_warning('There is no experiment running...\nYou can use \'nnictl experiment list all\' to list all stopped experiments!')
experiment_information = "" experiment_information = ""
for key in experiment_id_list: for key in experiment_id_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\ experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
...@@ -382,8 +409,8 @@ def get_time_interval(time1, time2): ...@@ -382,8 +409,8 @@ def get_time_interval(time1, time2):
'''get the interval of two times''' '''get the interval of two times'''
try: try:
#convert time to timestamp #convert time to timestamp
time1 = time.mktime(time.strptime(time1, '%Y-%m-%d %H:%M:%S')) time1 = time.mktime(time.strptime(time1, '%Y/%m/%d %H:%M:%S'))
time2 = time.mktime(time.strptime(time2, '%Y-%m-%d %H:%M:%S')) time2 = time.mktime(time.strptime(time2, '%Y/%m/%d %H:%M:%S'))
seconds = (datetime.datetime.fromtimestamp(time2) - datetime.datetime.fromtimestamp(time1)).seconds seconds = (datetime.datetime.fromtimestamp(time2) - datetime.datetime.fromtimestamp(time1)).seconds
#convert seconds to day:hour:minute:second #convert seconds to day:hour:minute:second
days = seconds / 86400 days = seconds / 86400
...@@ -403,21 +430,21 @@ def show_experiment_info(): ...@@ -403,21 +430,21 @@ def show_experiment_info():
if not experiment_dict: if not experiment_dict:
print('There is no experiment running...') print('There is no experiment running...')
exit(1) exit(1)
update_experiment()
experiment_id_list = [] experiment_id_list = []
for key in experiment_dict.keys(): for key in experiment_dict.keys():
if experiment_dict[key]['status'] == 'running': if experiment_dict[key]['status'] != 'STOPPED':
experiment_id_list.append(key) experiment_id_list.append(key)
if not experiment_id_list: if not experiment_id_list:
print_warning('There is no experiment running...') print_warning('There is no experiment running...')
return return
for key in experiment_id_list: for key in experiment_id_list:
current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
print(EXPERIMENT_MONITOR_INFO % (key, experiment_dict[key]['status'], experiment_dict[key]['port'], \ print(EXPERIMENT_MONITOR_INFO % (key, experiment_dict[key]['status'], experiment_dict[key]['port'], \
experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], get_time_interval(experiment_dict[key]['startTime'], current_time))) experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], get_time_interval(experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])))
print(TRIAL_MONITOR_HEAD) print(TRIAL_MONITOR_HEAD)
running, response = check_rest_server_quick(experiment_dict[key]['port']) running, response = check_rest_server_quick(experiment_dict[key]['port'])
if running: if running:
response = rest_get(trial_jobs_url(experiment_dict[key]['port']), 20) response = rest_get(trial_jobs_url(experiment_dict[key]['port']), REST_TIME_OUT)
if response and check_response(response): if response and check_response(response):
content = json.loads(response.text) content = json.loads(response.text)
for index, value in enumerate(content): for index, value in enumerate(content):
...@@ -433,7 +460,7 @@ def monitor_experiment(args): ...@@ -433,7 +460,7 @@ def monitor_experiment(args):
while True: while True:
try: try:
os.system('clear') os.system('clear')
update_experiment_status() update_experiment()
show_experiment_info() show_experiment_info()
time.sleep(args.time) time.sleep(args.time)
except KeyboardInterrupt: except KeyboardInterrupt:
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
import time import time
import requests import requests
from .url_utils import check_status_url from .url_utils import check_status_url
from .constants import REST_TIME_OUT
def rest_put(url, data, timeout): def rest_put(url, data, timeout):
'''Call rest put method''' '''Call rest put method'''
...@@ -61,7 +62,7 @@ def check_rest_server(rest_port): ...@@ -61,7 +62,7 @@ def check_rest_server(rest_port):
'''Check if restful server is ready''' '''Check if restful server is ready'''
retry_count = 5 retry_count = 5
for _ in range(retry_count): for _ in range(retry_count):
response = rest_get(check_status_url(rest_port), 20) response = rest_get(check_status_url(rest_port), REST_TIME_OUT)
if response: if response:
if response.status_code == 200: if response.status_code == 200:
return True, response return True, response
......
...@@ -144,7 +144,7 @@ def start_tensorboard(args): ...@@ -144,7 +144,7 @@ def start_tensorboard(args):
running, response = check_rest_server_quick(rest_port) running, response = check_rest_server_quick(rest_port)
trial_content = None trial_content = None
if running: if running:
response = rest_get(trial_jobs_url(rest_port), 20) response = rest_get(trial_jobs_url(rest_port), REST_TIME_OUT)
if response and check_response(response): if response and check_response(response):
trial_content = json.loads(response.text) trial_content = json.loads(response.text)
else: else:
......
...@@ -27,6 +27,7 @@ from .config_utils import Config ...@@ -27,6 +27,7 @@ from .config_utils import Config
from .common_utils import get_json_content from .common_utils import get_json_content
from .nnictl_utils import check_experiment_id, get_experiment_port, get_config_filename from .nnictl_utils import check_experiment_id, get_experiment_port, get_config_filename
from .launcher_utils import parse_time from .launcher_utils import parse_time
from .constants import REST_TIME_OUT
def validate_digit(value, start, end): def validate_digit(value, start, end):
'''validate if a digit is valid''' '''validate if a digit is valid'''
...@@ -62,11 +63,11 @@ def update_experiment_profile(args, key, value): ...@@ -62,11 +63,11 @@ def update_experiment_profile(args, key, value):
rest_port = nni_config.get_config('restServerPort') rest_port = nni_config.get_config('restServerPort')
running, _ = check_rest_server_quick(rest_port) running, _ = check_rest_server_quick(rest_port)
if running: if running:
response = rest_get(experiment_url(rest_port), 20) response = rest_get(experiment_url(rest_port), REST_TIME_OUT)
if response and check_response(response): if response and check_response(response):
experiment_profile = json.loads(response.text) experiment_profile = json.loads(response.text)
experiment_profile['params'][key] = value experiment_profile['params'][key] = value
response = rest_put(experiment_url(rest_port)+get_query_type(key), json.dumps(experiment_profile), 20) response = rest_put(experiment_url(rest_port)+get_query_type(key), json.dumps(experiment_profile), REST_TIME_OUT)
if response and check_response(response): if response and check_response(response):
return response return response
else: else:
......
...@@ -35,6 +35,7 @@ STDOUT_FULL_PATH = os.path.join(LOG_DIR, 'stdout') ...@@ -35,6 +35,7 @@ STDOUT_FULL_PATH = os.path.join(LOG_DIR, 'stdout')
STDERR_FULL_PATH = os.path.join(LOG_DIR, 'stderr') STDERR_FULL_PATH = os.path.join(LOG_DIR, 'stderr')
STDOUT_API = '/stdout' STDOUT_API = '/stdout'
VERSION_API = '/version'
NNI_SYS_DIR = os.environ['NNI_SYS_DIR'] NNI_SYS_DIR = os.environ['NNI_SYS_DIR']
NNI_TRIAL_JOB_ID = os.environ['NNI_TRIAL_JOB_ID'] NNI_TRIAL_JOB_ID = os.environ['NNI_TRIAL_JOB_ID']
NNI_EXP_ID = os.environ['NNI_EXP_ID'] NNI_EXP_ID = os.environ['NNI_EXP_ID']
\ No newline at end of file
...@@ -27,14 +27,18 @@ import shlex ...@@ -27,14 +27,18 @@ import shlex
import re import re
import sys import sys
import select import select
import json
from pyhdfs import HdfsClient from pyhdfs import HdfsClient
import pkg_resources import pkg_resources
from .rest_utils import rest_post
from .url_utils import gen_send_stdout_url, gen_send_version_url
from .constants import HOME_DIR, LOG_DIR, NNI_PLATFORM, STDOUT_FULL_PATH, STDERR_FULL_PATH from .constants import HOME_DIR, LOG_DIR, NNI_PLATFORM, STDOUT_FULL_PATH, STDERR_FULL_PATH
from .hdfsClientUtility import copyDirectoryToHdfs, copyHdfsDirectoryToLocal from .hdfsClientUtility import copyDirectoryToHdfs, copyHdfsDirectoryToLocal
from .log_utils import LogType, nni_log, RemoteLogger, PipeLogReader, StdOutputType from .log_utils import LogType, nni_log, RemoteLogger, PipeLogReader, StdOutputType
logger = logging.getLogger('trial_keeper') logger = logging.getLogger('trial_keeper')
# Extracts the leading "major[.minor]" part of a version string such as
# 'v0.6.1' into the named group 'version'.
# The raw string avoids the invalid '\.' escape sequence, and '+' keeps
# multi-digit components intact so that '0.10' is not truncated to '0.1'
# (which would make mismatched versions compare as equal).
regular = re.compile(r'v?(?P<version>[0-9]+(\.[0-9]+){0,1}).*')
def main_loop(args): def main_loop(args):
'''main loop logic for trial keeper''' '''main loop logic for trial keeper'''
...@@ -110,21 +114,27 @@ def check_version(args): ...@@ -110,21 +114,27 @@ def check_version(args):
#package nni does not exist, try nni-tool package #package nni does not exist, try nni-tool package
nni_log(LogType.Error, 'Package nni does not exist!') nni_log(LogType.Error, 'Package nni does not exist!')
os._exit(1) os._exit(1)
if not args.version: if not args.nni_manager_version:
# skip version check # skip version check
nni_log(LogType.Warning, 'Skipping version check!') nni_log(LogType.Warning, 'Skipping version check!')
else: else:
regular = re.compile('v?(?P<version>[0-9](\.[0-9]){0,2}).*')
try: try:
trial_keeper_version = regular.search(trial_keeper_version).group('version') trial_keeper_version = regular.search(trial_keeper_version).group('version')
nni_log(LogType.Info, 'trial_keeper_version is {0}'.format(trial_keeper_version)) nni_log(LogType.Info, 'trial_keeper_version is {0}'.format(trial_keeper_version))
training_service_version = regular.search(args.version).group('version') nni_manager_version = regular.search(args.nni_manager_version).group('version')
nni_log(LogType.Info, 'training_service_version is {0}'.format(training_service_version)) nni_log(LogType.Info, 'nni_manager_version is {0}'.format(nni_manager_version))
if trial_keeper_version != training_service_version: log_entry = {}
if trial_keeper_version != nni_manager_version:
nni_log(LogType.Error, 'Version does not match!') nni_log(LogType.Error, 'Version does not match!')
error_message = 'NNIManager version is {0}, TrialKeeper version is {1}, NNI version does not match!'.format(nni_manager_version, trial_keeper_version)
log_entry['tag'] = 'VCFail'
log_entry['msg'] = error_message
rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port), json.dumps(log_entry), 10, False)
os._exit(1) os._exit(1)
else: else:
nni_log(LogType.Info, 'Version match!') nni_log(LogType.Info, 'Version match!')
log_entry['tag'] = 'VCSuccess'
rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port), json.dumps(log_entry), 10, False)
except AttributeError as err: except AttributeError as err:
nni_log(LogType.Error, err) nni_log(LogType.Error, err)
...@@ -142,7 +152,7 @@ if __name__ == '__main__': ...@@ -142,7 +152,7 @@ if __name__ == '__main__':
PARSER.add_argument('--pai_user_name', type=str, help='the username of hdfs') PARSER.add_argument('--pai_user_name', type=str, help='the username of hdfs')
PARSER.add_argument('--nni_hdfs_exp_dir', type=str, help='nni experiment directory in hdfs') PARSER.add_argument('--nni_hdfs_exp_dir', type=str, help='nni experiment directory in hdfs')
PARSER.add_argument('--webhdfs_path', type=str, help='the webhdfs path used in webhdfs URL') PARSER.add_argument('--webhdfs_path', type=str, help='the webhdfs path used in webhdfs URL')
PARSER.add_argument('--version', type=str, help='the nni version transmitted from trainingService') PARSER.add_argument('--nni_manager_version', type=str, help='the nni version transmitted from nniManager')
PARSER.add_argument('--log_collection', type=str, help='set the way to collect log in trialkeeper') PARSER.add_argument('--log_collection', type=str, help='set the way to collect log in trialkeeper')
args, unknown = PARSER.parse_known_args() args, unknown = PARSER.parse_known_args()
if args.trial_command is None: if args.trial_command is None:
......
...@@ -18,8 +18,12 @@ ...@@ -18,8 +18,12 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from .constants import API_ROOT_URL, BASE_URL, STDOUT_API, NNI_TRIAL_JOB_ID, NNI_EXP_ID from .constants import API_ROOT_URL, BASE_URL, STDOUT_API, NNI_TRIAL_JOB_ID, NNI_EXP_ID, VERSION_API
def gen_send_stdout_url(ip, port): def gen_send_stdout_url(ip, port):
'''Generate send stdout url''' '''Generate send stdout url'''
return '{0}:{1}{2}{3}/{4}/{5}'.format(BASE_URL.format(ip), port, API_ROOT_URL, STDOUT_API, NNI_EXP_ID, NNI_TRIAL_JOB_ID) return '{0}:{1}{2}{3}/{4}/{5}'.format(BASE_URL.format(ip), port, API_ROOT_URL, STDOUT_API, NNI_EXP_ID, NNI_TRIAL_JOB_ID)
\ No newline at end of file
def gen_send_version_url(ip, port):
    '''Generate the url used to send the version check result.

    The url is built from BASE_URL formatted with ``ip``, followed by
    ``port``, API_ROOT_URL, VERSION_API, NNI_EXP_ID and NNI_TRIAL_JOB_ID.
    '''
    return '{0}:{1}{2}{3}/{4}/{5}'.format(BASE_URL.format(ip), port, API_ROOT_URL, VERSION_API, NNI_EXP_ID, NNI_TRIAL_JOB_ID)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment