Commit df6145a2 authored by Yuge Zhang's avatar Yuge Zhang
Browse files

Merge branch 'master' of https://github.com/microsoft/nni into dev-retiarii

parents 0f0c6288 f8424a9f
from .ppo_tuner import PPOTuner from .ppo_tuner import PPOTuner, PPOClassArgsValidator
from .regularized_evolution_tuner import RegularizedEvolutionTuner
# Copyright (c) Microsoft Corporation. # Copyright (c) Microsoft Corporation.
# Licensed under the MIT license. # Licensed under the MIT license.
from .smac_tuner import SMACTuner from .smac_tuner import SMACTuner, SMACClassArgsValidator
git+https://github.com/QuanluZhang/ConfigSpace.git
git+https://github.com/QuanluZhang/SMAC3.git
...@@ -2,4 +2,4 @@ ...@@ -2,4 +2,4 @@
# Licensed under the MIT license. # Licensed under the MIT license.
from .mutator import RegularizedDartsMutator, RegularizedMutatorParallel, DartsDiscreteMutator from .mutator import RegularizedDartsMutator, RegularizedMutatorParallel, DartsDiscreteMutator
from .trainer import CdartsTrainer from .trainer import CdartsTrainer
\ No newline at end of file
from .mutator import RandomMutator from .mutator import RandomMutator
\ No newline at end of file
...@@ -662,7 +662,7 @@ class QuantGrad(torch.autograd.Function): ...@@ -662,7 +662,7 @@ class QuantGrad(torch.autograd.Function):
if quant_type == QuantType.QUANT_INPUT: if quant_type == QuantType.QUANT_INPUT:
output = wrapper.quantizer.quantize_input(tensor, wrapper, **kwargs) output = wrapper.quantizer.quantize_input(tensor, wrapper, **kwargs)
elif quant_type == QuantType.QUANT_WEIGHT: elif quant_type == QuantType.QUANT_WEIGHT:
output = wrapper.quantizer.quantize_weight(wrapper, **kwargs) output = wrapper.quantizer.quantize_weight(wrapper, **kwargs)
elif quant_type == QuantType.QUANT_OUTPUT: elif quant_type == QuantType.QUANT_OUTPUT:
output = wrapper.quantizer.quantize_output(tensor, wrapper, **kwargs) output = wrapper.quantizer.quantize_output(tensor, wrapper, **kwargs)
else: else:
......
...@@ -87,7 +87,7 @@ class ConfigBase: ...@@ -87,7 +87,7 @@ class ConfigBase:
""" """
return dataclasses.asdict( return dataclasses.asdict(
self.canonical(), self.canonical(),
dict_factory = lambda items: dict((util.camel_case(k), v) for k, v in items if v is not None) dict_factory=lambda items: dict((util.camel_case(k), v) for k, v in items if v is not None)
) )
def canonical(self: T) -> T: def canonical(self: T) -> T:
......
...@@ -32,7 +32,7 @@ def start_experiment(config: ExperimentConfig, port: int, debug: bool) -> Tuple[ ...@@ -32,7 +32,7 @@ def start_experiment(config: ExperimentConfig, port: int, debug: bool) -> Tuple[
exp_id = management.generate_experiment_id() exp_id = management.generate_experiment_id()
try: try:
_logger.info(f'Creating experiment {colorama.Fore.CYAN}{exp_id}') _logger.info('Creating experiment %s%s', colorama.Fore.CYAN, exp_id)
pipe = Pipe(exp_id) pipe = Pipe(exp_id)
proc = _start_rest_server(config, port, debug, exp_id, pipe.path) proc = _start_rest_server(config, port, debug, exp_id, pipe.path)
_logger.info('Connecting IPC pipe...') _logger.info('Connecting IPC pipe...')
......
from .base_mutator import BaseMutator
from .base_trainer import BaseTrainer
from .fixed import apply_fixed_architecture
from .mutables import Mutable, LayerChoice, InputChoice
from .mutator import Mutator
from .trainer import Trainer
...@@ -12,7 +12,8 @@ _trial_env_var_names = [ ...@@ -12,7 +12,8 @@ _trial_env_var_names = [
'NNI_SYS_DIR', 'NNI_SYS_DIR',
'NNI_OUTPUT_DIR', 'NNI_OUTPUT_DIR',
'NNI_TRIAL_SEQ_ID', 'NNI_TRIAL_SEQ_ID',
'MULTI_PHASE' 'MULTI_PHASE',
'REUSE_MODE'
] ]
_dispatcher_env_var_names = [ _dispatcher_env_var_names = [
......
...@@ -31,7 +31,7 @@ def init_logger() -> None: ...@@ -31,7 +31,7 @@ def init_logger() -> None:
if trial_platform == 'unittest': if trial_platform == 'unittest':
return return
if trial_platform: if trial_platform and not trial_env_vars.REUSE_MODE:
_init_logger_trial() _init_logger_trial()
return return
......
...@@ -9,7 +9,7 @@ if trial_env_vars.NNI_PLATFORM is None: ...@@ -9,7 +9,7 @@ if trial_env_vars.NNI_PLATFORM is None:
from .standalone import * from .standalone import *
elif trial_env_vars.NNI_PLATFORM == 'unittest': elif trial_env_vars.NNI_PLATFORM == 'unittest':
from .test import * from .test import *
elif trial_env_vars.NNI_PLATFORM in ('adl', 'local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml'): elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'heterogeneous'):
from .local import * from .local import *
else: else:
raise RuntimeError('Unknown platform %s' % trial_env_vars.NNI_PLATFORM) raise RuntimeError('Unknown platform %s' % trial_env_vars.NNI_PLATFORM)
...@@ -19,6 +19,7 @@ _outputdir = trial_env_vars.NNI_OUTPUT_DIR ...@@ -19,6 +19,7 @@ _outputdir = trial_env_vars.NNI_OUTPUT_DIR
if not os.path.exists(_outputdir): if not os.path.exists(_outputdir):
os.makedirs(_outputdir) os.makedirs(_outputdir)
_reuse_mode = trial_env_vars.REUSE_MODE
_nni_platform = trial_env_vars.NNI_PLATFORM _nni_platform = trial_env_vars.NNI_PLATFORM
_multiphase = trial_env_vars.MULTI_PHASE _multiphase = trial_env_vars.MULTI_PHASE
...@@ -58,7 +59,7 @@ def get_next_parameter(): ...@@ -58,7 +59,7 @@ def get_next_parameter():
return params return params
def send_metric(string): def send_metric(string):
if _nni_platform != 'local': if _nni_platform != 'local' or _reuse_mode in ('true', 'True'):
assert len(string) < 1000000, 'Metric too long' assert len(string) < 1000000, 'Metric too long'
print("NNISDK_MEb'%s'" % (string), flush=True) print("NNISDK_MEb'%s'" % (string), flush=True)
else: else:
......
...@@ -124,7 +124,7 @@ common_schema = { ...@@ -124,7 +124,7 @@ common_schema = {
Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')), Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')),
Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999), Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999),
'trainingServicePlatform': setChoice( 'trainingServicePlatform': setChoice(
'trainingServicePlatform', 'adl', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml'), 'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'heterogeneous'),
Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'), Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'),
Optional('multiPhase'): setType('multiPhase', bool), Optional('multiPhase'): setType('multiPhase', bool),
Optional('multiThread'): setType('multiThread', bool), Optional('multiThread'): setType('multiThread', bool),
...@@ -208,7 +208,7 @@ pai_trial_schema = { ...@@ -208,7 +208,7 @@ pai_trial_schema = {
} }
pai_config_schema = { pai_config_schema = {
'paiConfig': { Optional('paiConfig'): {
'userName': setType('userName', str), 'userName': setType('userName', str),
Or('passWord', 'token', only_one=True): str, Or('passWord', 'token', only_one=True): str,
'host': setType('host', str), 'host': setType('host', str),
...@@ -252,7 +252,7 @@ aml_trial_schema = { ...@@ -252,7 +252,7 @@ aml_trial_schema = {
} }
aml_config_schema = { aml_config_schema = {
'amlConfig': { Optional('amlConfig'): {
'subscriptionId': setType('subscriptionId', str), 'subscriptionId': setType('subscriptionId', str),
'resourceGroup': setType('resourceGroup', str), 'resourceGroup': setType('resourceGroup', str),
'workspaceName': setType('workspaceName', str), 'workspaceName': setType('workspaceName', str),
...@@ -262,6 +262,29 @@ aml_config_schema = { ...@@ -262,6 +262,29 @@ aml_config_schema = {
} }
} }
# Schema fragment for the `trial` section when trainingServicePlatform is
# 'heterogeneous'.  The accepted keys look like a union of the fields used by
# the individual platforms (NFS mount paths, PAI image/storage fields, etc.)
# — presumably because a heterogeneous experiment may dispatch trials to any
# of them; confirm against the per-platform trial schemas.
heterogeneous_trial_schema = {
    'trial': {
        # Required: local directory containing the trial code.
        'codeDir': setPathCheck('codeDir'),
        Optional('nniManagerNFSMountPath'): setPathCheck('nniManagerNFSMountPath'),
        Optional('containerNFSMountPath'): setType('containerNFSMountPath', str),
        Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
        # Required: shell command that launches one trial.
        'command': setType('command', str),
        Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
        Optional('cpuNum'): setNumberRange('cpuNum', int, 0, 99999),
        Optional('memoryMB'): setType('memoryMB', int),
        Optional('image'): setType('image', str),
        Optional('virtualCluster'): setType('virtualCluster', str),
        Optional('paiStorageConfigName'): setType('paiStorageConfigName', str),
        # Path must exist on the machine running nnictl.
        Optional('paiConfigPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'paiConfigPath')
    }
}
# Schema fragment for the `heterogeneousConfig` section: the list of
# training-service platforms a heterogeneous experiment may schedule trials
# on.  With the `schema` library, a list value validates each element against
# the listed alternatives, so every entry must be one of these four strings.
heterogeneous_config_schema = {
    'heterogeneousConfig': {
        'trainingServicePlatforms': ['local', 'remote', 'pai', 'aml']
    }
}
adl_trial_schema = { adl_trial_schema = {
'trial':{ 'trial':{
'codeDir': setType('codeDir', str), 'codeDir': setType('codeDir', str),
...@@ -404,7 +427,7 @@ remote_config_schema = { ...@@ -404,7 +427,7 @@ remote_config_schema = {
} }
machine_list_schema = { machine_list_schema = {
'machineList': [Or( Optional('machineList'): [Or(
{ {
'ip': setType('ip', str), 'ip': setType('ip', str),
Optional('port'): setNumberRange('port', int, 1, 65535), Optional('port'): setNumberRange('port', int, 1, 65535),
...@@ -438,6 +461,8 @@ training_service_schema_dict = { ...@@ -438,6 +461,8 @@ training_service_schema_dict = {
'frameworkcontroller': Schema({**common_schema, **frameworkcontroller_trial_schema, **frameworkcontroller_config_schema}), 'frameworkcontroller': Schema({**common_schema, **frameworkcontroller_trial_schema, **frameworkcontroller_config_schema}),
'aml': Schema({**common_schema, **aml_trial_schema, **aml_config_schema}), 'aml': Schema({**common_schema, **aml_trial_schema, **aml_config_schema}),
'dlts': Schema({**common_schema, **dlts_trial_schema, **dlts_config_schema}), 'dlts': Schema({**common_schema, **dlts_trial_schema, **dlts_config_schema}),
'heterogeneous': Schema({**common_schema, **heterogeneous_trial_schema, **heterogeneous_config_schema, **machine_list_schema,
**pai_config_schema, **aml_config_schema, **remote_config_schema}),
} }
...@@ -454,6 +479,7 @@ class NNIConfigSchema: ...@@ -454,6 +479,7 @@ class NNIConfigSchema:
self.validate_pai_trial_conifg(experiment_config) self.validate_pai_trial_conifg(experiment_config)
self.validate_kubeflow_operators(experiment_config) self.validate_kubeflow_operators(experiment_config)
self.validate_eth0_device(experiment_config) self.validate_eth0_device(experiment_config)
self.validate_heterogeneous_platforms(experiment_config)
def validate_tuner_adivosr_assessor(self, experiment_config): def validate_tuner_adivosr_assessor(self, experiment_config):
if experiment_config.get('advisor'): if experiment_config.get('advisor'):
...@@ -563,3 +589,16 @@ class NNIConfigSchema: ...@@ -563,3 +589,16 @@ class NNIConfigSchema:
and not experiment_config.get('nniManagerIp') \ and not experiment_config.get('nniManagerIp') \
and 'eth0' not in netifaces.interfaces(): and 'eth0' not in netifaces.interfaces():
raise SchemaError('This machine does not contain eth0 network device, please set nniManagerIp in config file!') raise SchemaError('This machine does not contain eth0 network device, please set nniManagerIp in config file!')
def validate_heterogeneous_platforms(self, experiment_config):
    """Check that every platform listed under ``heterogeneousConfig`` has its
    companion config section present in ``experiment_config``.

    Raises ``SchemaError`` when a listed platform is missing its required
    section; does nothing for non-heterogeneous experiments.
    """
    # Only heterogeneous experiments carry per-platform sub-configs.
    if experiment_config.get('trainingServicePlatform') != 'heterogeneous':
        return
    # Platform name -> config key that must be present for that platform.
    platform_requirements = {
        'remote': 'machineList',
        'aml': 'amlConfig',
        'pai': 'paiConfig'
    }
    for platform in experiment_config['heterogeneousConfig']['trainingServicePlatforms']:
        required_key = platform_requirements.get(platform)
        # 'local' has no mandatory section, so it maps to None and is skipped.
        if required_key and not experiment_config.get(required_key):
            raise SchemaError('Need to set {0} for {1} in heterogeneous mode!'.format(required_key, platform))
\ No newline at end of file
...@@ -85,7 +85,10 @@ class Experiments: ...@@ -85,7 +85,10 @@ class Experiments:
self.experiments = self.read_file() self.experiments = self.read_file()
if expId not in self.experiments: if expId not in self.experiments:
return False return False
self.experiments[expId][key] = value if value is None:
self.experiments[expId].pop(key, None)
else:
self.experiments[expId][key] = value
self.write_file() self.write_file()
return True return True
......
...@@ -118,13 +118,6 @@ def set_local_config(experiment_config, port, config_file_name): ...@@ -118,13 +118,6 @@ def set_local_config(experiment_config, port, config_file_name):
request_data = dict() request_data = dict()
if experiment_config.get('localConfig'): if experiment_config.get('localConfig'):
request_data['local_config'] = experiment_config['localConfig'] request_data['local_config'] = experiment_config['localConfig']
if request_data['local_config']:
if request_data['local_config'].get('gpuIndices') and isinstance(request_data['local_config'].get('gpuIndices'), int):
request_data['local_config']['gpuIndices'] = str(request_data['local_config'].get('gpuIndices'))
if request_data['local_config'].get('maxTrialNumOnEachGpu'):
request_data['local_config']['maxTrialNumOnEachGpu'] = request_data['local_config'].get('maxTrialNumOnEachGpu')
if request_data['local_config'].get('useActiveGpu'):
request_data['local_config']['useActiveGpu'] = request_data['local_config'].get('useActiveGpu')
response = rest_put(cluster_metadata_url(port), json.dumps(request_data), REST_TIME_OUT) response = rest_put(cluster_metadata_url(port), json.dumps(request_data), REST_TIME_OUT)
err_message = '' err_message = ''
if not response or not check_response(response): if not response or not check_response(response):
...@@ -306,6 +299,37 @@ def set_aml_config(experiment_config, port, config_file_name): ...@@ -306,6 +299,37 @@ def set_aml_config(experiment_config, port, config_file_name):
#set trial_config #set trial_config
return set_trial_config(experiment_config, port, config_file_name), err_message return set_trial_config(experiment_config, port, config_file_name), err_message
def set_heterogeneous_config(experiment_config, port, config_file_name):
    '''Set heterogeneous training-service configuration on the REST server.

    Pushes the heterogeneous platform list plus each listed platform's config
    section (aml/remote/local/pai) as cluster metadata, then sets the
    nniManagerIp and finally the trial config.

    Returns a (success, err_message) tuple; err_message is None on success.
    '''
    heterogeneous_config_data = dict()
    heterogeneous_config_data['heterogeneous_config'] = experiment_config['heterogeneousConfig']
    platform_list = experiment_config['heterogeneousConfig']['trainingServicePlatforms']
    for platform in platform_list:
        if platform == 'aml':
            heterogeneous_config_data['aml_config'] = experiment_config['amlConfig']
        elif platform == 'remote':
            # remoteConfig is optional; machineList is required for 'remote'
            # (enforced by validate_heterogeneous_platforms).
            if experiment_config.get('remoteConfig'):
                heterogeneous_config_data['remote_config'] = experiment_config['remoteConfig']
            heterogeneous_config_data['machine_list'] = experiment_config['machineList']
        elif platform == 'local' and experiment_config.get('localConfig'):
            heterogeneous_config_data['local_config'] = experiment_config['localConfig']
        elif platform == 'pai':
            heterogeneous_config_data['pai_config'] = experiment_config['paiConfig']
    response = rest_put(cluster_metadata_url(port), json.dumps(heterogeneous_config_data), REST_TIME_OUT)
    err_message = None
    if not response or not response.status_code == 200:
        if response is not None:
            err_message = response.text
            # BUG FIX: log only when there is an actual response body.
            # Previously json.loads(err_message) ran unconditionally, so a
            # None response (connection failure) raised TypeError here
            # instead of returning (False, None).
            _, stderr_full_path = get_log_path(config_file_name)
            with open(stderr_full_path, 'a+') as fout:
                # NOTE(review): assumes the REST error body is valid JSON —
                # a non-JSON body would raise here; confirm server contract.
                fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':')))
        return False, err_message
    result, message = setNNIManagerIp(experiment_config, port, config_file_name)
    if not result:
        return result, message
    # set trial_config
    return set_trial_config(experiment_config, port, config_file_name), err_message
def set_experiment(experiment_config, mode, port, config_file_name): def set_experiment(experiment_config, mode, port, config_file_name):
'''Call startExperiment (rest POST /experiment) with yaml file content''' '''Call startExperiment (rest POST /experiment) with yaml file content'''
request_data = dict() request_data = dict()
...@@ -387,6 +411,21 @@ def set_experiment(experiment_config, mode, port, config_file_name): ...@@ -387,6 +411,21 @@ def set_experiment(experiment_config, mode, port, config_file_name):
{'key': 'aml_config', 'value': experiment_config['amlConfig']}) {'key': 'aml_config', 'value': experiment_config['amlConfig']})
request_data['clusterMetaData'].append( request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']}) {'key': 'trial_config', 'value': experiment_config['trial']})
elif experiment_config['trainingServicePlatform'] == 'heterogeneous':
request_data['clusterMetaData'].append(
{'key': 'heterogeneous_config', 'value': experiment_config['heterogeneousConfig']})
platform_list = experiment_config['heterogeneousConfig']['trainingServicePlatforms']
request_dict = {
'aml': {'key': 'aml_config', 'value': experiment_config.get('amlConfig')},
'remote': {'key': 'machine_list', 'value': experiment_config.get('machineList')},
'pai': {'key': 'pai_config', 'value': experiment_config.get('paiConfig')},
'local': {'key': 'local_config', 'value': experiment_config.get('localConfig')}
}
for platform in platform_list:
if request_dict.get(platform):
request_data['clusterMetaData'].append(request_dict[platform])
request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']})
response = rest_post(experiment_url(port), json.dumps(request_data), REST_TIME_OUT, show_error=True) response = rest_post(experiment_url(port), json.dumps(request_data), REST_TIME_OUT, show_error=True)
if check_response(response): if check_response(response):
return response return response
...@@ -420,6 +459,8 @@ def set_platform_config(platform, experiment_config, port, config_file_name, res ...@@ -420,6 +459,8 @@ def set_platform_config(platform, experiment_config, port, config_file_name, res
config_result, err_msg = set_dlts_config(experiment_config, port, config_file_name) config_result, err_msg = set_dlts_config(experiment_config, port, config_file_name)
elif platform == 'aml': elif platform == 'aml':
config_result, err_msg = set_aml_config(experiment_config, port, config_file_name) config_result, err_msg = set_aml_config(experiment_config, port, config_file_name)
elif platform == 'heterogeneous':
config_result, err_msg = set_heterogeneous_config(experiment_config, port, config_file_name)
else: else:
raise Exception(ERROR_INFO % 'Unsupported platform!') raise Exception(ERROR_INFO % 'Unsupported platform!')
exit(1) exit(1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment