Unverified Commit c702241e authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Rename heterogeneous to hybrid (#3252)

parent b4f0d321
**Run an Experiment on Heterogeneous Mode**
**Run an Experiment on Hybrid Mode**
===========================================
Run NNI on heterogeneous mode means that NNI will run trials jobs in multiple kinds of training platforms. For example, NNI could submit trial jobs to remote machine and AML simultaneously.
Running NNI in hybrid mode means that NNI will run trial jobs on multiple kinds of training platforms. For example, NNI could submit trial jobs to remote machines and AML simultaneously.
Setup environment
-----------------
NNI has supported `local <./LocalMode.rst>`__\ , `remote <./RemoteMachineMode.rst>`__\ , `PAI <./PaiMode.rst>`__\ , and `AML <./AMLMode.rst>`__ for heterogeneous training service. Before starting an experiment using these mode, users should setup the corresponding environment for the platforms. More details about the environment setup could be found in the corresponding docs.
NNI supports `local <./LocalMode.rst>`__\ , `remote <./RemoteMachineMode.rst>`__\ , `PAI <./PaiMode.rst>`__\ , and `AML <./AMLMode.rst>`__ for the hybrid training service. Before starting an experiment using these modes, users should set up the corresponding environment for the platforms. More details about the environment setup can be found in the corresponding docs.
Run an experiment
-----------------
......@@ -20,7 +20,7 @@ Use ``examples/trials/mnist-tfv1`` as an example. The NNI config YAML file's con
trialConcurrency: 2
maxExecDuration: 1h
maxTrialNum: 10
trainingServicePlatform: heterogeneous
trainingServicePlatform: hybrid
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
......@@ -33,7 +33,7 @@ Use ``examples/trials/mnist-tfv1`` as an example. The NNI config YAML file's con
command: python3 mnist.py
codeDir: .
gpuNum: 1
heterogeneousConfig:
hybridConfig:
trainingServicePlatforms:
- local
- remote
......@@ -44,11 +44,11 @@ Use ``examples/trials/mnist-tfv1`` as an example. The NNI config YAML file's con
username: bob
passwd: bob123
Configurations for heterogeneous mode:
Configurations for hybrid mode:
heterogeneousConfig:
hybridConfig:
* trainingServicePlatforms. required key. This field specify the platforms used in heterogeneous mode, the values using yaml list format. NNI support setting ``local``, ``remote``, ``aml``, ``pai`` in this field.
* trainingServicePlatforms. Required key. This field specifies the platforms used in hybrid mode; the values use YAML list format. NNI supports setting ``local``, ``remote``, ``aml``, and ``pai`` in this field.
.. Note:: If setting a platform in trainingServicePlatforms mode, users should also set the corresponding configuration for the platform. For example, if set ``remote`` as one of the platform, should also set ``machineList`` and ``remoteConfig`` configuration.
.. Note:: If a platform is set in trainingServicePlatforms, users should also set the corresponding configuration for that platform. For example, if ``remote`` is one of the platforms, the ``machineList`` and ``remoteConfig`` configurations should also be set. The local platform in hybrid mode does not support Windows for now.
......@@ -11,4 +11,4 @@ Introduction to NNI Training Services
FrameworkController<./TrainingService/FrameworkControllerMode>
DLTS<./TrainingService/DLTSMode>
AML<./TrainingService/AMLMode>
Heterogeneous<./TrainingService/HeterogeneousMode>
Hybrid<./TrainingService/HybridMode>
......@@ -3,7 +3,7 @@ experimentName: example_mnist
trialConcurrency: 3
maxExecDuration: 1h
maxTrialNum: 10
trainingServicePlatform: heterogeneous
trainingServicePlatform: hybrid
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
......@@ -18,7 +18,7 @@ trial:
command: python3 mnist.py
codeDir: .
gpuNum: 0
heterogeneousConfig:
hybridConfig:
trainingServicePlatforms:
- local
- remote
......
......@@ -9,7 +9,7 @@ if trial_env_vars.NNI_PLATFORM is None:
from .standalone import *
elif trial_env_vars.NNI_PLATFORM == 'unittest':
from .test import *
elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'heterogeneous'):
elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'hybrid'):
from .local import *
else:
raise RuntimeError('Unknown platform %s' % trial_env_vars.NNI_PLATFORM)
......@@ -124,7 +124,7 @@ common_schema = {
Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')),
Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999),
'trainingServicePlatform': setChoice(
'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'heterogeneous'),
'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'hybrid'),
Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'),
Optional('multiPhase'): setType('multiPhase', bool),
Optional('multiThread'): setType('multiThread', bool),
......@@ -262,7 +262,7 @@ aml_config_schema = {
}
}
heterogeneous_trial_schema = {
hybrid_trial_schema = {
'trial': {
'codeDir': setPathCheck('codeDir'),
Optional('nniManagerNFSMountPath'): setPathCheck('nniManagerNFSMountPath'),
......@@ -279,8 +279,8 @@ heterogeneous_trial_schema = {
}
}
heterogeneous_config_schema = {
'heterogeneousConfig': {
hybrid_config_schema = {
'hybridConfig': {
'trainingServicePlatforms': ['local', 'remote', 'pai', 'aml']
}
}
......@@ -461,7 +461,7 @@ training_service_schema_dict = {
'frameworkcontroller': Schema({**common_schema, **frameworkcontroller_trial_schema, **frameworkcontroller_config_schema}),
'aml': Schema({**common_schema, **aml_trial_schema, **aml_config_schema}),
'dlts': Schema({**common_schema, **dlts_trial_schema, **dlts_config_schema}),
'heterogeneous': Schema({**common_schema, **heterogeneous_trial_schema, **heterogeneous_config_schema, **machine_list_schema,
'hybrid': Schema({**common_schema, **hybrid_trial_schema, **hybrid_config_schema, **machine_list_schema,
**pai_config_schema, **aml_config_schema, **remote_config_schema}),
}
......@@ -479,7 +479,7 @@ class NNIConfigSchema:
self.validate_pai_trial_conifg(experiment_config)
self.validate_kubeflow_operators(experiment_config)
self.validate_eth0_device(experiment_config)
self.validate_heterogeneous_platforms(experiment_config)
self.validate_hybrid_platforms(experiment_config)
def validate_tuner_adivosr_assessor(self, experiment_config):
if experiment_config.get('advisor'):
......@@ -590,15 +590,15 @@ class NNIConfigSchema:
and 'eth0' not in netifaces.interfaces():
raise SchemaError('This machine does not contain eth0 network device, please set nniManagerIp in config file!')
def validate_heterogeneous_platforms(self, experiment_config):
def validate_hybrid_platforms(self, experiment_config):
required_config_name_map = {
'remote': 'machineList',
'aml': 'amlConfig',
'pai': 'paiConfig'
}
if experiment_config.get('trainingServicePlatform') == 'heterogeneous':
for platform in experiment_config['heterogeneousConfig']['trainingServicePlatforms']:
if experiment_config.get('trainingServicePlatform') == 'hybrid':
for platform in experiment_config['hybridConfig']['trainingServicePlatforms']:
config_name = required_config_name_map.get(platform)
if config_name and not experiment_config.get(config_name):
raise SchemaError('Need to set {0} for {1} in heterogeneous mode!'.format(config_name, platform))
raise SchemaError('Need to set {0} for {1} in hybrid mode!'.format(config_name, platform))
\ No newline at end of file
......@@ -300,23 +300,23 @@ def set_aml_config(experiment_config, port, config_file_name):
#set trial_config
return set_trial_config(experiment_config, port, config_file_name), err_message
def set_heterogeneous_config(experiment_config, port, config_file_name):
'''set heterogeneous configuration'''
heterogeneous_config_data = dict()
heterogeneous_config_data['heterogeneous_config'] = experiment_config['heterogeneousConfig']
platform_list = experiment_config['heterogeneousConfig']['trainingServicePlatforms']
def set_hybrid_config(experiment_config, port, config_file_name):
'''set hybrid configuration'''
hybrid_config_data = dict()
hybrid_config_data['hybrid_config'] = experiment_config['hybridConfig']
platform_list = experiment_config['hybridConfig']['trainingServicePlatforms']
for platform in platform_list:
if platform == 'aml':
heterogeneous_config_data['aml_config'] = experiment_config['amlConfig']
hybrid_config_data['aml_config'] = experiment_config['amlConfig']
elif platform == 'remote':
if experiment_config.get('remoteConfig'):
heterogeneous_config_data['remote_config'] = experiment_config['remoteConfig']
heterogeneous_config_data['machine_list'] = experiment_config['machineList']
hybrid_config_data['remote_config'] = experiment_config['remoteConfig']
hybrid_config_data['machine_list'] = experiment_config['machineList']
elif platform == 'local' and experiment_config.get('localConfig'):
heterogeneous_config_data['local_config'] = experiment_config['localConfig']
hybrid_config_data['local_config'] = experiment_config['localConfig']
elif platform == 'pai':
heterogeneous_config_data['pai_config'] = experiment_config['paiConfig']
response = rest_put(cluster_metadata_url(port), json.dumps(heterogeneous_config_data), REST_TIME_OUT)
hybrid_config_data['pai_config'] = experiment_config['paiConfig']
response = rest_put(cluster_metadata_url(port), json.dumps(hybrid_config_data), REST_TIME_OUT)
err_message = None
if not response or not response.status_code == 200:
if response is not None:
......@@ -412,10 +412,10 @@ def set_experiment(experiment_config, mode, port, config_file_name):
{'key': 'aml_config', 'value': experiment_config['amlConfig']})
request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']})
elif experiment_config['trainingServicePlatform'] == 'heterogeneous':
elif experiment_config['trainingServicePlatform'] == 'hybrid':
request_data['clusterMetaData'].append(
{'key': 'heterogeneous_config', 'value': experiment_config['heterogeneousConfig']})
platform_list = experiment_config['heterogeneousConfig']['trainingServicePlatforms']
{'key': 'hybrid_config', 'value': experiment_config['hybridConfig']})
platform_list = experiment_config['hybridConfig']['trainingServicePlatforms']
request_dict = {
'aml': {'key': 'aml_config', 'value': experiment_config.get('amlConfig')},
'remote': {'key': 'machine_list', 'value': experiment_config.get('machineList')},
......@@ -460,8 +460,8 @@ def set_platform_config(platform, experiment_config, port, config_file_name, res
config_result, err_msg = set_dlts_config(experiment_config, port, config_file_name)
elif platform == 'aml':
config_result, err_msg = set_aml_config(experiment_config, port, config_file_name)
elif platform == 'heterogeneous':
config_result, err_msg = set_heterogeneous_config(experiment_config, port, config_file_name)
elif platform == 'hybrid':
config_result, err_msg = set_hybrid_config(experiment_config, port, config_file_name)
else:
raise Exception(ERROR_INFO % 'Unsupported platform!')
exit(1)
......
......@@ -37,7 +37,7 @@ function initStartupInfo(
}
async function initContainer(foreground: boolean, platformMode: string, logFileName?: string): Promise<void> {
const routerPlatformMode = ['remote', 'pai', 'aml', 'heterogeneous'];
const routerPlatformMode = ['remote', 'pai', 'aml', 'hybrid'];
if (routerPlatformMode.includes(platformMode)) {
Container.bind(TrainingService)
.to(RouterTrainingService)
......@@ -97,7 +97,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
function usage(): void {
console.info('usage: node main.js --port <port> --mode \
<local/remote/pai/kubeflow/frameworkcontroller/paiYarn/aml/adl/heterogeneous> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>');
<local/remote/pai/kubeflow/frameworkcontroller/paiYarn/aml/adl/hybrid> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>');
}
const strPort: string = parseArg(['--port', '-p']);
......@@ -117,7 +117,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals
const port: number = parseInt(strPort, 10);
const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'heterogeneous'].includes(mode)) {
if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'hybrid'].includes(mode)) {
console.log(`FATAL: unknown mode: ${mode}`);
usage();
process.exit(1);
......
......@@ -183,7 +183,7 @@ export namespace ValidationSchemas {
maxTrialNumPerGpu: joi.number(),
useActiveGpu: joi.boolean()
}),
heterogeneous_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
hybrid_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
trainingServicePlatforms: joi.array(),
}),
nni_manager_ip: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
......
......@@ -11,7 +11,7 @@ export enum TrialConfigMetadataKey {
LOCAL_CONFIG = 'local_config',
TRIAL_CONFIG = 'trial_config',
REMOTE_CONFIG = 'remote_config',
HETEROGENEOUS_CONFIG = 'heterogeneous_config',
HYBRID_CONFIG = 'hybrid_config',
EXPERIMENT_ID = 'experimentId',
MULTI_PHASE = 'multiPhase',
RANDOM_SCHEDULER = 'random_scheduler',
......@@ -24,7 +24,7 @@ export enum TrialConfigMetadataKey {
AML_CLUSTER_CONFIG = 'aml_config',
VERSION_CHECK = 'version_check',
LOG_COLLECTION = 'log_collection',
// Used to set platform for heterogeneous in reuse mode,
// Used to set platform for hybrid in reuse mode,
// temporary change; the config schema will be refactored in the future
PLATFORM_LIST = 'platform_list'
}
......@@ -95,8 +95,8 @@ class RouterTrainingService implements TrainingService {
public async setClusterMetadata(key: string, value: string): Promise<void> {
if (this.internalTrainingService === undefined) {
// Need to refactor configuration, remove heterogeneous_config field in the future
if (key === TrialConfigMetadataKey.HETEROGENEOUS_CONFIG){
// Need to refactor configuration, remove hybrid_config field in the future
if (key === TrialConfigMetadataKey.HYBRID_CONFIG){
this.internalTrainingService = component.get(TrialDispatcher);
const heterogenousConfig: HeterogenousConfig = <HeterogenousConfig>JSON.parse(value);
if (this.internalTrainingService === undefined) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment