"src/sdk/vscode:/vscode.git/clone" did not exist on "41e587038a32db002178329a64224ea6a5b2a5f9"
Unverified commit bee8f84e authored by SparkSnail, committed by GitHub

Merge pull request #174 from microsoft/master

merge master from Microsoft
parents c5acd8c2 252d35e0
# Metis Tuner

## Metis Tuner

Most tuning tools only predict the optimal configuration, while [Metis](https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/) has the advantage of giving you two outputs: (a) the current prediction of the optimal configuration, and (b) a suggestion for the next trial. No more random guessing!

Most tools assume the training data contain no noise, but Metis actually tells you whether a particular hyper-parameter needs to be re-sampled.

Most tools have the problem of being exploitation-heavy, whereas Metis's search strategy balances exploration, exploitation, and (optional) re-sampling.

Metis belongs to the class of sequential model-based optimization (SMBO) and is built on the Bayesian optimization framework. To model the hyper-parameter vs. performance space, Metis uses both a Gaussian Process and a Gaussian Mixture Model (GMM). Since every trial can carry a high time cost, Metis relies heavily on the already-trained models for its inference. At each iteration, Metis performs two tasks:

- It finds the global optimal point in the Gaussian Process space; this point represents the best configuration.
- It identifies the next hyper-parameter candidate, by weighing the potential information gain of exploration, exploitation, and re-sampling.

Note that the search space only supports `choice`, `quniform`, `uniform`, and `randint`; a minimal example follows.
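A sketch of a search space restricted to those four types (parameter names and ranges are illustrative; see the NNI search-space documentation for the exact semantics of each type):

```json
{
    "batch_size":   {"_type": "choice",   "_value": [16, 32, 64]},
    "hidden_size":  {"_type": "quniform", "_value": [64, 512, 64]},
    "dropout_rate": {"_type": "uniform",  "_value": [0.1, 0.5]},
    "num_layers":   {"_type": "randint",  "_value": [1, 5]}
}
```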
For more details, please refer to the paper: https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/
@@ -21,21 +21,21 @@
 smac_tuner.py
 """
-from nni.tuner import Tuner
-from nni.utils import OptimizeMode, extract_scalar_reward
 import sys
 import logging
 import numpy as np
-import json_tricks
-from enum import Enum, unique
-from .convert_ss_to_scenario import generate_scenario
+from nni.tuner import Tuner
+from nni.utils import OptimizeMode, extract_scalar_reward
 from smac.utils.io.cmd_reader import CMDReader
 from smac.scenario.scenario import Scenario
 from smac.facade.smac_facade import SMAC
 from smac.facade.roar_facade import ROAR
 from smac.facade.epils_facade import EPILS
+from ConfigSpaceNNI import Configuration
+from .convert_ss_to_scenario import generate_scenario

 class SMACTuner(Tuner):
@@ -57,6 +57,7 @@ class SMACTuner(Tuner):
         self.update_ss_done = False
         self.loguniform_key = set()
         self.categorical_dict = {}
+        self.cs = None

     def _main_cli(self):
         """Main function of SMAC for CLI interface
@@ -66,7 +67,7 @@ class SMACTuner(Tuner):
         instance
             optimizer
         """
-        self.logger.info("SMAC call: %s" % (" ".join(sys.argv)))
+        self.logger.info("SMAC call: %s", " ".join(sys.argv))

         cmd_reader = CMDReader()
         args, _ = cmd_reader.read_cmd()
@@ -95,6 +96,7 @@ class SMACTuner(Tuner):
         # Create scenario-object
         scen = Scenario(args.scenario_file, [])
+        self.cs = scen.cs

         if args.mode == "SMAC":
             optimizer = SMAC(
@@ -258,4 +260,45 @@ class SMACTuner(Tuner):
         return params

     def import_data(self, data):
-        pass
+        """Import additional data for tuning
+
+        Parameters
+        ----------
+        data:
+            a list of dictionaries, each of which has at least two keys, 'parameter' and 'value'
+        """
+        _completed_num = 0
+        for trial_info in data:
+            self.logger.info("Importing data, current processing progress %s / %s", _completed_num, len(data))
+            # simply validate data format
+            assert "parameter" in trial_info
+            _params = trial_info["parameter"]
+            assert "value" in trial_info
+            _value = trial_info['value']
+            if not _value:
+                self.logger.info("Useless trial data, value is %s, skip this trial data.", _value)
+                continue
+            # convert the keys in loguniform and categorical types
+            valid_entry = True
+            for key, value in _params.items():
+                if key in self.loguniform_key:
+                    _params[key] = np.log(value)
+                elif key in self.categorical_dict:
+                    if value in self.categorical_dict[key]:
+                        _params[key] = self.categorical_dict[key].index(value)
+                    else:
+                        self.logger.info("The value %s of key %s is not in search space.", str(value), key)
+                        valid_entry = False
+                        break
+            if not valid_entry:
+                continue
+            # start import this data entry
+            _completed_num += 1
+            config = Configuration(self.cs, values=_params)
+            if self.optimize_mode is OptimizeMode.Maximize:
+                _value = -_value
+            if self.first_one:
+                self.smbo_solver.nni_smac_receive_first_run(config, _value)
+                self.first_one = False
+            else:
+                self.smbo_solver.nni_smac_receive_runs(config, _value)
+        self.logger.info("Successfully import data to smac tuner, total data: %d, imported data: %d.", len(data), _completed_num)
@@ -28,12 +28,11 @@ class IntermediateVal extends React.Component<IntermediateValProps, {}> {
                 if (wei > 6) {
                     result = `${lastVal.toFixed(6)}`;
                 }
-                if (status === 'SUCCEEDED') {
-                    result = `${lastVal.toFixed(6)} (FINAL)`;
-                } else {
-                    result = `${lastVal.toFixed(6)} (LATEST)`;
-                }
             }
+            if (status === 'SUCCEEDED') {
+                result = `${result} (FINAL)`;
+            } else {
+                result = `${result} (LATEST)`;
+            }
         } else {
             result = '--';
@@ -3,7 +3,7 @@ experimentName: default_test
 maxExecDuration: 5m
 maxTrialNum: 4
 trialConcurrency: 2
-searchSpacePath: ../../../examples/trials/mnist-cascading-search-space/search_space.json
+searchSpacePath: ../../../examples/trials/mnist-nested-search-space/search_space.json

 tuner:
   #choice: TPE, Random, Anneal, Evolution
@@ -13,7 +13,7 @@ assessor:
   classArgs:
     optimize_mode: maximize
 trial:
-  codeDir: ../../../examples/trials/mnist-cascading-search-space
+  codeDir: ../../../examples/trials/mnist-nested-search-space
   command: python3 mnist.py --batch_num 100
   gpuNum: 0
@@ -63,7 +63,9 @@ common_schema = {
     Optional('advisor'): dict,
     Optional('assessor'): dict,
     Optional('localConfig'): {
-        Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!')
+        Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
+        Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int),
+        Optional('useActiveGpu'): setType('useActiveGpu', bool)
     }
 }

 tuner_schema_dict = {
@@ -310,26 +312,30 @@ frameworkcontroller_config_schema = {
     })
 }

-machine_list_schima = {
+machine_list_schema = {
     Optional('machineList'):[Or({
         'ip': setType('ip', str),
         Optional('port'): setNumberRange('port', int, 1, 65535),
         'username': setType('username', str),
         'passwd': setType('passwd', str),
-        Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!')
+        Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
+        Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int),
+        Optional('useActiveGpu'): setType('useActiveGpu', bool)
     },{
         'ip': setType('ip', str),
         Optional('port'): setNumberRange('port', int, 1, 65535),
         'username': setType('username', str),
         'sshKeyPath': setPathCheck('sshKeyPath'),
         Optional('passphrase'): setType('passphrase', str),
-        Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!')
+        Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
+        Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int),
+        Optional('useActiveGpu'): setType('useActiveGpu', bool)
     })]
 }

 LOCAL_CONFIG_SCHEMA = Schema({**common_schema, **common_trial_schema})
-REMOTE_CONFIG_SCHEMA = Schema({**common_schema, **common_trial_schema, **machine_list_schima})
+REMOTE_CONFIG_SCHEMA = Schema({**common_schema, **common_trial_schema, **machine_list_schema})
 PAI_CONFIG_SCHEMA = Schema({**common_schema, **pai_trial_schema, **pai_config_schema})
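For illustration, the two new optional fields would be written in an experiment config roughly like this (a hedged sketch: values are examples, and the field semantics are inferred from their names):

```yaml
localConfig:
  gpuIndices: "0,1"        # int or comma-separated string, as the schema above allows
  maxTrialNumPerGpu: 2     # presumably the number of concurrent trials allowed per GPU
  useActiveGpu: false      # presumably whether GPUs that already run processes may be used
```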
@@ -23,6 +23,7 @@ import os
 import json
 import shutil
 from .constants import NNICTL_HOME_DIR
+from .common_utils import print_error

 class Config:
     '''a util class to load and save config'''
@@ -119,4 +120,26 @@ class Experiments:
                 return json.load(file)
             except ValueError:
                 return {}
-        return {}
\ No newline at end of file
+        return {}
+
+class HDFSConfig:
+    '''manage hdfs configuration'''
+    def __init__(self):
+        os.makedirs(NNICTL_HOME_DIR, exist_ok=True)
+        self.hdfs_config_file = os.path.join(NNICTL_HOME_DIR, '.hdfs')
+
+    def get_config(self):
+        if os.path.exists(self.hdfs_config_file):
+            try:
+                with open(self.hdfs_config_file, 'r') as file:
+                    return json.load(file)
+            except Exception as exception:
+                print_error(exception)
+                return None
+        else:
+            return None
+
+    def set_config(self, host, user_name):
+        with open(self.hdfs_config_file, 'w') as file:
+            json.dump({'host': host, 'userName': user_name}, file)
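A minimal usage sketch for the new class (host and user name are placeholders; the file is persisted under NNICTL_HOME_DIR):

```python
hdfs_config = HDFSConfig()
hdfs_config.set_config('10.1.2.3', 'nni_user')  # writes {'host': ..., 'userName': ...} to the .hdfs file
print(hdfs_config.get_config())                 # -> {'host': '10.1.2.3', 'userName': 'nni_user'}, or None
```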
@@ -86,12 +86,13 @@ TUNERS_SUPPORTING_IMPORT_DATA = {
     'Anneal',
     'GridSearch',
     'MetisTuner',
-    'BOHB'
+    'BOHB',
+    'SMAC',
+    'BatchTuner'
 }

 TUNERS_NO_NEED_TO_IMPORT_DATA = {
     'Random',
-    'Batch_tuner',
     'Hyperband'
 }
@@ -160,9 +160,13 @@ def set_local_config(experiment_config, port, config_file_name):
     request_data = dict()
     if experiment_config.get('localConfig'):
         request_data['local_config'] = experiment_config['localConfig']
-        if request_data['local_config'] and request_data['local_config'].get('gpuIndices') \
-        and isinstance(request_data['local_config'].get('gpuIndices'), int):
-            request_data['local_config']['gpuIndices'] = str(request_data['local_config'].get('gpuIndices'))
+        if request_data['local_config']:
+            if request_data['local_config'].get('gpuIndices') and isinstance(request_data['local_config'].get('gpuIndices'), int):
+                request_data['local_config']['gpuIndices'] = str(request_data['local_config'].get('gpuIndices'))
+            if request_data['local_config'].get('maxTrialNumOnEachGpu'):
+                request_data['local_config']['maxTrialNumOnEachGpu'] = request_data['local_config'].get('maxTrialNumOnEachGpu')
+            if request_data['local_config'].get('useActiveGpu'):
+                request_data['local_config']['useActiveGpu'] = request_data['local_config'].get('useActiveGpu')
     response = rest_put(cluster_metadata_url(port), json.dumps(request_data), REST_TIME_OUT)
     err_message = ''
     if not response or not check_response(response):
@@ -343,6 +347,13 @@ def set_experiment(experiment_config, mode, port, config_file_name):
 def launch_experiment(args, experiment_config, mode, config_file_name, experiment_id=None):
     '''follow steps to start rest server and start experiment'''
     nni_config = Config(config_file_name)
+    # check execution policy in powershell
+    if sys.platform == 'win32':
+        execution_policy = check_output(['powershell.exe', 'Get-ExecutionPolicy']).decode('ascii').strip()
+        if execution_policy == 'Restricted':
+            print_error('PowerShell execution policy error, please run PowerShell as administrator with this command first:\r\n'\
+                + '\'Set-ExecutionPolicy -ExecutionPolicy Unrestricted\'')
+            exit(1)
     # check packages for tuner
     package_name, module_name = None, None
     if experiment_config.get('tuner') and experiment_config['tuner'].get('builtinTunerName'):
@@ -194,6 +194,15 @@ def parse_args():
                             'the unit is second')
     parser_top.set_defaults(func=monitor_experiment)

+    parser_hdfs = subparsers.add_parser('hdfs', help='monitor hdfs files')
+    parser_hdfs_subparsers = parser_hdfs.add_subparsers()
+    parser_hdfs_set = parser_hdfs_subparsers.add_parser('set', help='set the host and userName of hdfs')
+    parser_hdfs_set.add_argument('--host', required=True, dest='host', help='the host of hdfs')
+    parser_hdfs_set.add_argument('--user_name', required=True, dest='user_name', help='the userName of hdfs')
+    parser_hdfs_set.set_defaults(func=hdfs_set)
+    parser_hdfs_list = parser_hdfs_subparsers.add_parser('clean', help='clean hdfs files')
+    parser_hdfs_list.set_defaults(func=hdfs_clean)
+
     args = parser.parse_args()
     args.func(args)
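Given these definitions, wired to the `hdfs_set` and `hdfs_clean` handlers later in this commit, an illustrative session would be (host and user name are placeholders):

```
nnictl hdfs set --host 10.1.2.3 --user_name nni_user
nnictl hdfs clean
```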
@@ -26,8 +26,9 @@ import datetime
 import time
 from subprocess import call, check_output
 from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response
-from .config_utils import Config, Experiments
-from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url
+from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, export_data_url
+from pyhdfs import HdfsClient, HdfsFileNotFoundException
+from .config_utils import Config, Experiments, HDFSConfig
 from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \
     EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL, REST_TIME_OUT
 from .common_utils import print_normal, print_error, print_warning, detect_process
@@ -450,30 +451,9 @@ def monitor_experiment(args):
         print_error(exception)
         exit(1)

-def parse_trial_data(content):
-    """output: List[Dict]"""
-    trial_records = []
-    for trial_data in content:
-        for phase_i in range(len(trial_data['hyperParameters'])):
-            hparam = json.loads(trial_data['hyperParameters'][phase_i])['parameters']
-            hparam['id'] = trial_data['id']
-            if 'finalMetricData' in trial_data.keys() and phase_i < len(trial_data['finalMetricData']):
-                reward = json.loads(trial_data['finalMetricData'][phase_i]['data'])
-                if isinstance(reward, (float, int)):
-                    dict_tmp = {**hparam, **{'reward': reward}}
-                elif isinstance(reward, dict):
-                    dict_tmp = {**hparam, **reward}
-                else:
-                    raise ValueError("Invalid finalMetricsData format: {}/{}".format(type(reward), reward))
-            else:
-                dict_tmp = hparam
-            trial_records.append(dict_tmp)
-    return trial_records
-
 def export_trials_data(args):
-    """export experiment metadata to csv
-    """
+    '''export experiment metadata to csv
+    '''
     nni_config = Config(get_config_filename(args))
     rest_port = nni_config.get_config('restServerPort')
     rest_pid = nni_config.get_config('restServerPid')
@@ -482,26 +462,60 @@ def export_trials_data(args):
         return
     running, response = check_rest_server_quick(rest_port)
     if running:
-        response = rest_get(trial_jobs_url(rest_port), 20)
+        response = rest_get(export_data_url(rest_port), 20)
         if response is not None and check_response(response):
-            content = json.loads(response.text)
-            # dframe = pd.DataFrame.from_records([parse_trial_data(t_data) for t_data in content])
-            # dframe.to_csv(args.csv_path, sep='\t')
-            records = parse_trial_data(content)
             if args.type == 'json':
-                json_records = []
-                for trial in records:
-                    value = trial.pop('reward', None)
-                    trial_id = trial.pop('id', None)
-                    json_records.append({'parameter': trial, 'value': value, 'id': trial_id})
-            with open(args.path, 'w') as file:
-                if args.type == 'csv':
-                    writer = csv.DictWriter(file, set.union(*[set(r.keys()) for r in records]))
+                with open(args.path, 'w') as file:
+                    file.write(response.text)
+            elif args.type == 'csv':
+                content = json.loads(response.text)
+                trial_records = []
+                for record in content:
+                    if not isinstance(record['value'], (float, int)):
+                        formated_record = {**record['parameter'], **record['value'], **{'id': record['id']}}
+                    else:
+                        formated_record = {**record['parameter'], **{'reward': record['value'], 'id': record['id']}}
+                    trial_records.append(formated_record)
+                with open(args.path, 'w') as file:
+                    writer = csv.DictWriter(file, set.union(*[set(r.keys()) for r in trial_records]))
                     writer.writeheader()
-                    writer.writerows(records)
-                else:
-                    json.dump(json_records, file)
+                    writer.writerows(trial_records)
+            else:
+                print_error('Unknown type: %s' % args.type)
+                exit(1)
         else:
             print_error('Export failed...')
     else:
         print_error('Restful server is not Running')
+
+def hdfs_set(args):
+    hdfsConfig = HDFSConfig()
+    hdfsConfig.set_config(args.host, args.user_name)
+    print_normal('HDFS account update success!')
+
+def hdfs_clean(args):
+    hdfsConfig = HDFSConfig()
+    if not hdfsConfig.get_config():
+        print_error('Please use \'nnictl hdfs set\' command to set hdfs account first!')
+        exit(1)
+    host = hdfsConfig.get_config().get('host')
+    user_name = hdfsConfig.get_config().get('userName')
+    hdfs_client = HdfsClient(hosts='{0}:80'.format(host), user_name=user_name, webhdfs_path='/webhdfs/api/v1', timeout=5)
+    root_path = os.path.join('/', user_name, 'nni', 'experiments')
+    while True:
+        inputs = input('INFO: clean up all files in {0}, do you want to continue?[Y/N]:'.format(root_path))
+        if inputs.lower() not in ['y', 'n', 'yes', 'no']:
+            print_warning('please input Y or N!')
+        elif inputs.lower() in ['n', 'no']:
+            exit(0)
+        else:
+            break
+    path_list = hdfs_client.listdir(root_path)
+    for path in path_list:
+        full_path = os.path.join(root_path, path)
+        print_normal('deleting {0}'.format(full_path))
+        if hdfs_client.delete(full_path, recursive=True):
+            print_normal('delete success!')
+        else:
+            print_normal('delete failed!')
+    print_normal('DONE')
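The JSON written by `export_trials_data` in json mode pairs naturally with the tuners' `import_data`; a minimal sketch of reading it back (the file name is illustrative):

```python
import json

# Each exported record carries at least 'parameter', 'value' and 'id',
# matching the keys the CSV branch above relies on.
with open('experiment_data.json') as file:
    records = json.load(file)
for record in records:
    print(record['id'], record['value'], record['parameter'])
```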
@@ -35,6 +35,8 @@ CHECK_STATUS_API = '/check-status'
 TRIAL_JOBS_API = '/trial-jobs'

+EXPORT_DATA_API = '/export-data'
+
 TENSORBOARD_API = '/tensorboard'
@@ -68,6 +70,11 @@ def trial_job_id_url(port, job_id):
     return '{0}:{1}{2}{3}/:{4}'.format(BASE_URL, port, API_ROOT_URL, TRIAL_JOBS_API, job_id)

+def export_data_url(port):
+    '''get export_data url'''
+    return '{0}:{1}{2}{3}'.format(BASE_URL, port, API_ROOT_URL, EXPORT_DATA_API)
+
 def tensorboard_url(port):
     '''get tensorboard url'''
     return '{0}:{1}{2}{3}'.format(BASE_URL, port, API_ROOT_URL, TENSORBOARD_API)
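Assuming `BASE_URL` is `http://localhost` and `API_ROOT_URL` is `/api/v1/nni` (neither constant appears in this hunk, so these values are an assumption), `export_data_url(8080)` would evaluate to `http://localhost:8080/api/v1/nni/export-data`.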