Commit df6145a2 authored by Yuge Zhang's avatar Yuge Zhang
Browse files

Merge branch 'master' of https://github.com/microsoft/nni into dev-retiarii

parents 0f0c6288 f8424a9f
from .ppo_tuner import PPOTuner
from .ppo_tuner import PPOTuner, PPOClassArgsValidator
from .regularized_evolution_tuner import RegularizedEvolutionTuner
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from .smac_tuner import SMACTuner
from .smac_tuner import SMACTuner, SMACClassArgsValidator
git+https://github.com/QuanluZhang/ConfigSpace.git
git+https://github.com/QuanluZhang/SMAC3.git
......@@ -2,4 +2,4 @@
# Licensed under the MIT license.
from .mutator import RegularizedDartsMutator, RegularizedMutatorParallel, DartsDiscreteMutator
from .trainer import CdartsTrainer
\ No newline at end of file
from .trainer import CdartsTrainer
from .mutator import RandomMutator
\ No newline at end of file
from .mutator import RandomMutator
......@@ -662,7 +662,7 @@ class QuantGrad(torch.autograd.Function):
if quant_type == QuantType.QUANT_INPUT:
output = wrapper.quantizer.quantize_input(tensor, wrapper, **kwargs)
elif quant_type == QuantType.QUANT_WEIGHT:
output = wrapper.quantizer.quantize_weight(wrapper, **kwargs)
output = wrapper.quantizer.quantize_weight(wrapper, **kwargs)
elif quant_type == QuantType.QUANT_OUTPUT:
output = wrapper.quantizer.quantize_output(tensor, wrapper, **kwargs)
else:
......
......@@ -87,7 +87,7 @@ class ConfigBase:
"""
return dataclasses.asdict(
self.canonical(),
dict_factory = lambda items: dict((util.camel_case(k), v) for k, v in items if v is not None)
dict_factory=lambda items: dict((util.camel_case(k), v) for k, v in items if v is not None)
)
def canonical(self: T) -> T:
......
......@@ -32,7 +32,7 @@ def start_experiment(config: ExperimentConfig, port: int, debug: bool) -> Tuple[
exp_id = management.generate_experiment_id()
try:
_logger.info(f'Creating experiment {colorama.Fore.CYAN}{exp_id}')
_logger.info('Creating experiment %s%s', colorama.Fore.CYAN, exp_id)
pipe = Pipe(exp_id)
proc = _start_rest_server(config, port, debug, exp_id, pipe.path)
_logger.info('Connecting IPC pipe...')
......
from .base_mutator import BaseMutator
from .base_trainer import BaseTrainer
from .fixed import apply_fixed_architecture
from .mutables import Mutable, LayerChoice, InputChoice
from .mutator import Mutator
from .trainer import Trainer
......@@ -12,7 +12,8 @@ _trial_env_var_names = [
'NNI_SYS_DIR',
'NNI_OUTPUT_DIR',
'NNI_TRIAL_SEQ_ID',
'MULTI_PHASE'
'MULTI_PHASE',
'REUSE_MODE'
]
_dispatcher_env_var_names = [
......
......@@ -31,7 +31,7 @@ def init_logger() -> None:
if trial_platform == 'unittest':
return
if trial_platform:
if trial_platform and not trial_env_vars.REUSE_MODE:
_init_logger_trial()
return
......
......@@ -9,7 +9,7 @@ if trial_env_vars.NNI_PLATFORM is None:
from .standalone import *
elif trial_env_vars.NNI_PLATFORM == 'unittest':
from .test import *
elif trial_env_vars.NNI_PLATFORM in ('adl', 'local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml'):
elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'heterogeneous'):
from .local import *
else:
raise RuntimeError('Unknown platform %s' % trial_env_vars.NNI_PLATFORM)
......@@ -19,6 +19,7 @@ _outputdir = trial_env_vars.NNI_OUTPUT_DIR
if not os.path.exists(_outputdir):
os.makedirs(_outputdir)
_reuse_mode = trial_env_vars.REUSE_MODE
_nni_platform = trial_env_vars.NNI_PLATFORM
_multiphase = trial_env_vars.MULTI_PHASE
......@@ -58,7 +59,7 @@ def get_next_parameter():
return params
def send_metric(string):
if _nni_platform != 'local':
if _nni_platform != 'local' or _reuse_mode in ('true', 'True'):
assert len(string) < 1000000, 'Metric too long'
print("NNISDK_MEb'%s'" % (string), flush=True)
else:
......
......@@ -124,7 +124,7 @@ common_schema = {
Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')),
Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999),
'trainingServicePlatform': setChoice(
'trainingServicePlatform', 'adl', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml'),
'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'heterogeneous'),
Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'),
Optional('multiPhase'): setType('multiPhase', bool),
Optional('multiThread'): setType('multiThread', bool),
......@@ -208,7 +208,7 @@ pai_trial_schema = {
}
pai_config_schema = {
'paiConfig': {
Optional('paiConfig'): {
'userName': setType('userName', str),
Or('passWord', 'token', only_one=True): str,
'host': setType('host', str),
......@@ -252,7 +252,7 @@ aml_trial_schema = {
}
aml_config_schema = {
'amlConfig': {
Optional('amlConfig'): {
'subscriptionId': setType('subscriptionId', str),
'resourceGroup': setType('resourceGroup', str),
'workspaceName': setType('workspaceName', str),
......@@ -262,6 +262,29 @@ aml_config_schema = {
}
}
# Trial schema for the 'heterogeneous' training service. It is a superset of
# the per-platform trial fields (local/remote command settings plus the
# PAI/AML container options), because a trial may be dispatched to any of the
# platforms combined under heterogeneousConfig.
heterogeneous_trial_schema = {
'trial': {
'codeDir': setPathCheck('codeDir'),
Optional('nniManagerNFSMountPath'): setPathCheck('nniManagerNFSMountPath'),
Optional('containerNFSMountPath'): setType('containerNFSMountPath', str),
Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
'command': setType('command', str),
# Resource limits; optional because local/remote platforms ignore them.
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('cpuNum'): setNumberRange('cpuNum', int, 0, 99999),
Optional('memoryMB'): setType('memoryMB', int),
# Container image and PAI-specific options, used only when 'pai' or 'aml'
# is among the selected platforms.
Optional('image'): setType('image', str),
Optional('virtualCluster'): setType('virtualCluster', str),
Optional('paiStorageConfigName'): setType('paiStorageConfigName', str),
Optional('paiConfigPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'paiConfigPath')
}
}
# Top-level section declaring which training-service platforms participate in
# a heterogeneous experiment. Each listed platform must also provide its own
# config section (machineList / amlConfig / paiConfig) — enforced separately
# by NNIConfigSchema.validate_heterogeneous_platforms.
heterogeneous_config_schema = {
'heterogeneousConfig': {
'trainingServicePlatforms': ['local', 'remote', 'pai', 'aml']
}
}
adl_trial_schema = {
'trial':{
'codeDir': setType('codeDir', str),
......@@ -404,7 +427,7 @@ remote_config_schema = {
}
machine_list_schema = {
'machineList': [Or(
Optional('machineList'): [Or(
{
'ip': setType('ip', str),
Optional('port'): setNumberRange('port', int, 1, 65535),
......@@ -438,6 +461,8 @@ training_service_schema_dict = {
'frameworkcontroller': Schema({**common_schema, **frameworkcontroller_trial_schema, **frameworkcontroller_config_schema}),
'aml': Schema({**common_schema, **aml_trial_schema, **aml_config_schema}),
'dlts': Schema({**common_schema, **dlts_trial_schema, **dlts_config_schema}),
'heterogeneous': Schema({**common_schema, **heterogeneous_trial_schema, **heterogeneous_config_schema, **machine_list_schema,
**pai_config_schema, **aml_config_schema, **remote_config_schema}),
}
......@@ -454,6 +479,7 @@ class NNIConfigSchema:
self.validate_pai_trial_conifg(experiment_config)
self.validate_kubeflow_operators(experiment_config)
self.validate_eth0_device(experiment_config)
self.validate_heterogeneous_platforms(experiment_config)
def validate_tuner_adivosr_assessor(self, experiment_config):
if experiment_config.get('advisor'):
......@@ -563,3 +589,16 @@ class NNIConfigSchema:
and not experiment_config.get('nniManagerIp') \
and 'eth0' not in netifaces.interfaces():
raise SchemaError('This machine does not contain eth0 network device, please set nniManagerIp in config file!')
def validate_heterogeneous_platforms(self, experiment_config):
    """Check that every platform listed under heterogeneousConfig has its
    required top-level config section, raising SchemaError otherwise.

    Only applies when trainingServicePlatform is 'heterogeneous'; any other
    platform value is a no-op.
    """
    if experiment_config.get('trainingServicePlatform') != 'heterogeneous':
        return
    # Platform name -> the top-level section that platform cannot run without.
    # 'local' is absent on purpose: it needs no extra section.
    required_sections = {
        'remote': 'machineList',
        'aml': 'amlConfig',
        'pai': 'paiConfig'
    }
    for current_platform in experiment_config['heterogeneousConfig']['trainingServicePlatforms']:
        needed = required_sections.get(current_platform)
        if needed is None or experiment_config.get(needed):
            continue
        raise SchemaError('Need to set {0} for {1} in heterogeneous mode!'.format(needed, current_platform))
\ No newline at end of file
......@@ -85,7 +85,10 @@ class Experiments:
self.experiments = self.read_file()
if expId not in self.experiments:
return False
self.experiments[expId][key] = value
if value is None:
self.experiments[expId].pop(key, None)
else:
self.experiments[expId][key] = value
self.write_file()
return True
......
......@@ -118,13 +118,6 @@ def set_local_config(experiment_config, port, config_file_name):
request_data = dict()
if experiment_config.get('localConfig'):
request_data['local_config'] = experiment_config['localConfig']
if request_data['local_config']:
if request_data['local_config'].get('gpuIndices') and isinstance(request_data['local_config'].get('gpuIndices'), int):
request_data['local_config']['gpuIndices'] = str(request_data['local_config'].get('gpuIndices'))
if request_data['local_config'].get('maxTrialNumOnEachGpu'):
request_data['local_config']['maxTrialNumOnEachGpu'] = request_data['local_config'].get('maxTrialNumOnEachGpu')
if request_data['local_config'].get('useActiveGpu'):
request_data['local_config']['useActiveGpu'] = request_data['local_config'].get('useActiveGpu')
response = rest_put(cluster_metadata_url(port), json.dumps(request_data), REST_TIME_OUT)
err_message = ''
if not response or not check_response(response):
......@@ -306,6 +299,37 @@ def set_aml_config(experiment_config, port, config_file_name):
#set trial_config
return set_trial_config(experiment_config, port, config_file_name), err_message
def set_heterogeneous_config(experiment_config, port, config_file_name):
    '''Push heterogeneous training-service configuration to the REST server.

    Sends the heterogeneousConfig section plus the per-platform sections
    (amlConfig / remoteConfig+machineList / localConfig / paiConfig) for every
    platform listed under trainingServicePlatforms, then sets nniManagerIp
    and finally the trial config.

    Returns a (success, err_message) tuple; on REST failure success is False
    and err_message carries the server's error text (also appended to the
    experiment's stderr log).
    '''
    heterogeneous_config_data = dict()
    heterogeneous_config_data['heterogeneous_config'] = experiment_config['heterogeneousConfig']
    platform_list = experiment_config['heterogeneousConfig']['trainingServicePlatforms']
    for platform in platform_list:
        if platform == 'aml':
            heterogeneous_config_data['aml_config'] = experiment_config['amlConfig']
        elif platform == 'remote':
            # remoteConfig is optional for 'remote'; machineList is mandatory
            # (guaranteed present by validate_heterogeneous_platforms).
            if experiment_config.get('remoteConfig'):
                heterogeneous_config_data['remote_config'] = experiment_config['remoteConfig']
            heterogeneous_config_data['machine_list'] = experiment_config['machineList']
        elif platform == 'local' and experiment_config.get('localConfig'):
            heterogeneous_config_data['local_config'] = experiment_config['localConfig']
        elif platform == 'pai':
            heterogeneous_config_data['pai_config'] = experiment_config['paiConfig']
    response = rest_put(cluster_metadata_url(port), json.dumps(heterogeneous_config_data), REST_TIME_OUT)
    err_message = None
    if not response or not response.status_code == 200:
        if response is not None:
            err_message = response.text
            _, stderr_full_path = get_log_path(config_file_name)
            with open(stderr_full_path, 'a+') as fout:
                # The error body is usually JSON; if it is not, log the raw
                # text instead of crashing on json.loads.
                try:
                    fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':')))
                except ValueError:
                    fout.write(err_message)
        return False, err_message
    result, message = setNNIManagerIp(experiment_config, port, config_file_name)
    if not result:
        return result, message
    # set trial_config
    return set_trial_config(experiment_config, port, config_file_name), err_message
def set_experiment(experiment_config, mode, port, config_file_name):
'''Call startExperiment (rest POST /experiment) with yaml file content'''
request_data = dict()
......@@ -387,6 +411,21 @@ def set_experiment(experiment_config, mode, port, config_file_name):
{'key': 'aml_config', 'value': experiment_config['amlConfig']})
request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']})
elif experiment_config['trainingServicePlatform'] == 'heterogeneous':
request_data['clusterMetaData'].append(
{'key': 'heterogeneous_config', 'value': experiment_config['heterogeneousConfig']})
platform_list = experiment_config['heterogeneousConfig']['trainingServicePlatforms']
request_dict = {
'aml': {'key': 'aml_config', 'value': experiment_config.get('amlConfig')},
'remote': {'key': 'machine_list', 'value': experiment_config.get('machineList')},
'pai': {'key': 'pai_config', 'value': experiment_config.get('paiConfig')},
'local': {'key': 'local_config', 'value': experiment_config.get('localConfig')}
}
for platform in platform_list:
if request_dict.get(platform):
request_data['clusterMetaData'].append(request_dict[platform])
request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']})
response = rest_post(experiment_url(port), json.dumps(request_data), REST_TIME_OUT, show_error=True)
if check_response(response):
return response
......@@ -420,6 +459,8 @@ def set_platform_config(platform, experiment_config, port, config_file_name, res
config_result, err_msg = set_dlts_config(experiment_config, port, config_file_name)
elif platform == 'aml':
config_result, err_msg = set_aml_config(experiment_config, port, config_file_name)
elif platform == 'heterogeneous':
config_result, err_msg = set_heterogeneous_config(experiment_config, port, config_file_name)
else:
raise Exception(ERROR_INFO % 'Unsupported platform!')
exit(1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment