Commit 05913424 authored by suiguoxin

Merge branch 'master' into quniform-tuners

parents e3c8552f 1dab3118
@@ -92,6 +92,16 @@ tuner_schema_dict = {
         Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
         Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
     },
+    'TPE': {
+        'builtinTunerName': 'TPE',
+        'classArgs': {
+            Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'),
+            Optional('parallel_optimize'): setType('parallel_optimize', bool),
+            Optional('constant_liar_type'): setChoice('constant_liar_type', 'min', 'max', 'mean')
+        },
+        Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
+        Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
+    },
     'NetworkMorphism': {
         'builtinTunerName': 'NetworkMorphism',
         'classArgs': {
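As a quick illustration of what the new `TPE` entry accepts, here is a minimal sketch written against the `schema` package directly, not against NNI's `setChoice`/`setType` helpers; the variable names and the sample config are made up for the example.

```python
# Minimal sketch of the validation the new TPE classArgs entry performs,
# using the `schema` package directly instead of NNI's helpers.
from schema import Schema, Optional, And

tpe_class_args = Schema({
    Optional('optimize_mode'): And(str, lambda v: v in ('maximize', 'minimize')),
    Optional('parallel_optimize'): bool,
    Optional('constant_liar_type'): And(str, lambda v: v in ('min', 'max', 'mean')),
})

# A classArgs block that enables the parallel (constant-liar) mode validates:
tpe_class_args.validate({
    'optimize_mode': 'maximize',
    'parallel_optimize': True,
    'constant_liar_type': 'min',
})
```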
@@ -210,7 +220,7 @@ common_trial_schema = {
     'trial':{
         'command': setType('command', str),
         'codeDir': setPathCheck('codeDir'),
-        'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
+        Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
         Optional('nasMode'): setChoice('classic_mode', 'enas_mode', 'oneshot_mode')
     }
 }
@@ -223,6 +233,8 @@ pai_trial_schema = {
         'cpuNum': setNumberRange('cpuNum', int, 0, 99999),
         'memoryMB': setType('memoryMB', int),
         'image': setType('image', str),
+        Optional('authFile'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\
+                             error='ERROR: authFile format error, authFile format is hdfs://xxx.xxx.xxx.xxx:xxx'),
         Optional('shmMB'): setType('shmMB', int),
         Optional('dataDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\
                              error='ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'),
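For reference, the pattern used for the new `authFile` field (and already used for `dataDir`) accepts an `hdfs://` URI with a dotted-quad host, an optional port and an optional path. A quick check with made-up addresses:

```python
import re

# The hdfs URI pattern from the schema above, exercised on invented addresses.
HDFS_URI = re.compile(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?')

assert HDFS_URI.match('hdfs://10.10.10.10:9000/user/nni/auth')
assert HDFS_URI.match('hdfs://127.0.0.1')
assert not HDFS_URI.match('http://10.10.10.10:9000')
```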
@@ -251,7 +263,8 @@ kubeflow_trial_schema = {
             'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
             'cpuNum': setNumberRange('cpuNum', int, 0, 99999),
             'memoryMB': setType('memoryMB', int),
-            'image': setType('image', str)
+            'image': setType('image', str),
+            Optional('privateRegistryAuthPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'privateRegistryAuthPath')
         },
         Optional('master'): {
             'replicas': setType('replicas', int),
@@ -259,7 +272,8 @@ kubeflow_trial_schema = {
             'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
             'cpuNum': setNumberRange('cpuNum', int, 0, 99999),
             'memoryMB': setType('memoryMB', int),
-            'image': setType('image', str)
+            'image': setType('image', str),
+            Optional('privateRegistryAuthPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'privateRegistryAuthPath')
         },
         Optional('worker'):{
             'replicas': setType('replicas', int),
@@ -267,7 +281,8 @@ kubeflow_trial_schema = {
             'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
             'cpuNum': setNumberRange('cpuNum', int, 0, 99999),
             'memoryMB': setType('memoryMB', int),
-            'image': setType('image', str)
+            'image': setType('image', str),
+            Optional('privateRegistryAuthPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'privateRegistryAuthPath')
         }
     }
 }
@@ -314,7 +329,8 @@ frameworkcontroller_trial_schema = {
             'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
             'cpuNum': setNumberRange('cpuNum', int, 0, 99999),
             'memoryMB': setType('memoryMB', int),
-            'image': setType('image', str)
+            'image': setType('image', str),
+            Optional('privateRegistryAuthPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'privateRegistryAuthPath')
         }]
     }
 }
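The `privateRegistryAuthPath` entries added above all reduce to an existence check on a local file. A small sketch of that behaviour; `SCHEMA_PATH_ERROR` here is only a stand-in for the real constant in config_schema.py:

```python
import os
from schema import Schema, Optional, And, SchemaError

SCHEMA_PATH_ERROR = '%s path not exist'  # assumed error template, for this sketch only

registry_auth = Schema({
    Optional('privateRegistryAuthPath'):
        And(os.path.exists, error=SCHEMA_PATH_ERROR % 'privateRegistryAuthPath'),
})

try:
    registry_auth.validate({'privateRegistryAuthPath': '/no/such/.docker/config.json'})
except SchemaError as err:
    print(err)  # rejected because the path does not exist on the local machine
```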
......
@@ -360,9 +360,14 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
         module_name = AdvisorModuleName.get(package_name)
     if package_name and module_name:
         try:
-            check_call([sys.executable, '-c', 'import %s'%(module_name)], stdout=PIPE, stderr=PIPE)
+            stdout_full_path, stderr_full_path = get_log_path(config_file_name)
+            with open(stdout_full_path, 'a+') as stdout_file, open(stderr_full_path, 'a+') as stderr_file:
+                check_call([sys.executable, '-c', 'import %s'%(module_name)], stdout=stdout_file, stderr=stderr_file)
         except CalledProcessError as e:
-            print_error('%s should be installed through \'nnictl package install --name %s\''%(package_name, package_name))
+            print_error('an error occurred while importing package %s.' %(package_name))
+            print_log_content(config_file_name)
+            if package_name in PACKAGE_REQUIREMENTS:
+                print_error('If %s is not installed, it should be installed through \'nnictl package install --name %s\''%(package_name, package_name))
             exit(1)
     log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None
     log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None
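The launcher change above redirects the import check's output into the experiment's log files and prints them on failure. A stand-alone sketch of that pattern; the paths and module name are examples, and the NNI helpers `get_log_path`/`print_log_content` are not reproduced here:

```python
import sys
from subprocess import check_call, CalledProcessError

def check_package_import(module_name, stdout_path, stderr_path):
    # Run the import in a child interpreter, appending its output to log files
    # instead of discarding it in a PIPE nobody reads.
    with open(stdout_path, 'a+') as stdout_file, open(stderr_path, 'a+') as stderr_file:
        check_call([sys.executable, '-c', 'import %s' % module_name],
                   stdout=stdout_file, stderr=stderr_file)

try:
    check_package_import('smac', '/tmp/nnictl_stdout.log', '/tmp/nnictl_stderr.log')
except CalledProcessError:
    print('importing the package failed; see the log files for the traceback')
```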
......
@@ -253,6 +253,14 @@ def validate_pai_trial_conifg(experiment_config):
         experiment_config['trial']['shmMB'] > experiment_config['trial']['memoryMB']:
             print_error('shmMB should be no more than memoryMB!')
             exit(1)
+        #backward compatibility
+        warning_information = '{0} is no longer supported in NNI, please remove the field from your config file!\
+            please refer to https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/PaiMode.md#run-an-experiment\
+            for how to get data and output models in trial code'
+        if experiment_config.get('trial').get('dataDir'):
+            print_warning(warning_information.format('dataDir'))
+        if experiment_config.get('trial').get('outputDir'):
+            print_warning(warning_information.format('outputDir'))
 
 def validate_all_content(experiment_config, config_path):
     '''Validate whether experiment_config is valid'''
......
@@ -90,6 +90,7 @@ def parse_args():
     #parse stop command
     parser_stop = subparsers.add_parser('stop', help='stop the experiment')
     parser_stop.add_argument('id', nargs='?', help='the id of experiment, use \'all\' to stop all running experiments')
+    parser_stop.add_argument('--port', '-p', dest='port', help='the port of restful server')
     parser_stop.set_defaults(func=stop_experiment)
     #parse trial command
......
@@ -118,12 +118,14 @@ def check_experiment_id(args, update=True):
 def parse_ids(args):
     '''Parse the arguments for nnictl stop
-    1.If there is an id specified, return the corresponding id
-    2.If there is no id specified, and there is an experiment running, return the id, or return Error
-    3.If the id matches an experiment, nnictl will return the id.
-    4.If the id ends with *, nnictl will match all ids matchs the regular
-    5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id
-    6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information
+    1.If a port is provided and no id is specified, return the id of the experiment that owns the port
+    2.If both a port and an id are provided, return the id if it owns the port, otherwise fail
+    3.If an id is specified, return the corresponding id
+    4.If no id is specified and there is an experiment running, return its id, otherwise return an error
+    5.If the id matches an experiment, nnictl will return the id
+    6.If the id ends with *, nnictl will match all ids matching the pattern
+    7.If the id does not exist but matches the prefix of an experiment id, nnictl will return the matched id
+    8.If the id does not exist but matches the prefix of multiple experiment ids, nnictl will list the matched ids
     '''
     update_experiment()
     experiment_config = Experiments()
@@ -140,7 +142,8 @@ def parse_ids(args):
         elif isinstance(experiment_dict[key], list):
             # if the config file is old version, remove the configuration from file
             experiment_config.remove_experiment(key)
-    if not args.id:
+    if args.port is not None:
+        for key in running_experiment_list:
+            if str(experiment_dict[key]['port']) == args.port:
+                result_list.append(key)
+        if args.id and result_list and args.id != result_list[0]:
+            print_error('Experiment id and restful server port do not match')
+            exit(1)
+    elif not args.id:
         if len(running_experiment_list) > 1:
             print_error('There are multiple experiments, please set the experiment id...')
             experiment_information = ""
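To make the new control flow concrete, here is a toy reproduction of the port lookup; the function name and the sample data are invented, and the real `parse_ids` additionally handles `all`, `*` suffixes and prefix matching:

```python
# Toy reproduction of the new --port handling in parse_ids (invented data).
def resolve_stop_target(experiment_dict, running_ids, exp_id=None, port=None):
    result = []
    if port is not None:
        # Collect the running experiment(s) whose restful server uses this port.
        result = [key for key in running_ids
                  if str(experiment_dict[key]['port']) == port]
        if exp_id and result and exp_id != result[0]:
            raise SystemExit('Experiment id and restful server port do not match')
    elif exp_id:
        result = [exp_id]
    return result

experiments = {'abc123': {'port': 8080}, 'def456': {'port': 8081}}
print(resolve_stop_target(experiments, ['abc123', 'def456'], port='8081'))  # ['def456']
```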
@@ -166,8 +175,8 @@ def parse_ids(args):
     if len(result_list) > 1:
         print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list) )
         return None
-    if not result_list and args.id and args.id != 'all':
-        print_error('There are no experiments matched, please set correct experiment id...')
+    if not result_list and ((args.id and args.id != 'all') or args.port):
+        print_error('There are no experiments matched, please set a correct experiment id or restful server port')
     elif not result_list:
         print_error('There is no experiment running...')
     return result_list
@@ -511,9 +520,9 @@ def platform_clean(args):
     if platform not in ['remote', 'pai']:
         print_normal('platform {0} not supported.'.format(platform))
         exit(0)
-    update_experiment()
     experiment_config = Experiments()
     experiment_dict = experiment_config.get_all_experiments()
+    update_experiment()
     id_list = list(experiment_dict.keys())
     dir_list = get_platform_dir(config_content)
     if not dir_list:
@@ -544,12 +553,12 @@
 def experiment_list(args):
     '''get the information of all experiments'''
-    update_experiment()
     experiment_config = Experiments()
     experiment_dict = experiment_config.get_all_experiments()
     if not experiment_dict:
         print_normal('Cannot find experiments.')
         exit(1)
+    update_experiment()
     experiment_id_list = []
     if args.all:
         for key in experiment_dict.keys():
@@ -586,12 +595,12 @@ def get_time_interval(time1, time2):
 def show_experiment_info():
     '''show experiment information in monitor'''
-    update_experiment()
     experiment_config = Experiments()
     experiment_dict = experiment_config.get_all_experiments()
     if not experiment_dict:
         print('There is no experiment running...')
         exit(1)
+    update_experiment()
     experiment_id_list = []
     for key in experiment_dict.keys():
         if experiment_dict[key]['status'] != 'STOPPED':
@@ -665,4 +674,4 @@ def export_trials_data(args):
         else:
             print_error('Export failed...')
     else:
-        print_error('Restful server is not Running')
\ No newline at end of file
+        print_error('Restful server is not Running')