Unverified Commit 817ec68b authored by liuzhe-lz's avatar liuzhe-lz Committed by GitHub
Browse files

Add native support for v2 config (#3466)

parent 6aaca5f7
...@@ -7,7 +7,7 @@ import logging ...@@ -7,7 +7,7 @@ import logging
import json import json
import base64 import base64
from .runtime.common import enable_multi_thread, enable_multi_phase from .runtime.common import enable_multi_thread
from .runtime.msg_dispatcher import MsgDispatcher from .runtime.msg_dispatcher import MsgDispatcher
from .tools.package_utils import create_builtin_class_instance, create_customized_class_instance from .tools.package_utils import create_builtin_class_instance, create_customized_class_instance
...@@ -29,10 +29,8 @@ def main(): ...@@ -29,10 +29,8 @@ def main():
exp_params = json.loads(exp_params_decode) exp_params = json.loads(exp_params_decode)
logger.debug('exp_params json obj: [%s]', json.dumps(exp_params, indent=4)) logger.debug('exp_params json obj: [%s]', json.dumps(exp_params, indent=4))
if exp_params.get('multiThread'): if exp_params.get('deprecated', {}).get('multiThread'):
enable_multi_thread() enable_multi_thread()
if exp_params.get('multiPhase'):
enable_multi_phase()
if exp_params.get('advisor') is not None: if exp_params.get('advisor') is not None:
# advisor is enabled and starts to run # advisor is enabled and starts to run
...@@ -61,10 +59,10 @@ def main(): ...@@ -61,10 +59,10 @@ def main():
def _run_advisor(exp_params): def _run_advisor(exp_params):
if exp_params.get('advisor').get('builtinAdvisorName'): if exp_params.get('advisor').get('name'):
dispatcher = create_builtin_class_instance( dispatcher = create_builtin_class_instance(
exp_params.get('advisor').get('builtinAdvisorName'), exp_params['advisor']['name'],
exp_params.get('advisor').get('classArgs'), exp_params['advisor'].get('classArgs'),
'advisors') 'advisors')
else: else:
dispatcher = create_customized_class_instance(exp_params.get('advisor')) dispatcher = create_customized_class_instance(exp_params.get('advisor'))
...@@ -78,26 +76,26 @@ def _run_advisor(exp_params): ...@@ -78,26 +76,26 @@ def _run_advisor(exp_params):
def _create_tuner(exp_params): def _create_tuner(exp_params):
if exp_params.get('tuner').get('builtinTunerName'): if exp_params['tuner'].get('name'):
tuner = create_builtin_class_instance( tuner = create_builtin_class_instance(
exp_params.get('tuner').get('builtinTunerName'), exp_params['tuner']['name'],
exp_params.get('tuner').get('classArgs'), exp_params['tuner'].get('classArgs'),
'tuners') 'tuners')
else: else:
tuner = create_customized_class_instance(exp_params.get('tuner')) tuner = create_customized_class_instance(exp_params['tuner'])
if tuner is None: if tuner is None:
raise AssertionError('Failed to create Tuner instance') raise AssertionError('Failed to create Tuner instance')
return tuner return tuner
def _create_assessor(exp_params): def _create_assessor(exp_params):
if exp_params.get('assessor').get('builtinAssessorName'): if exp_params['assessor'].get('name'):
assessor = create_builtin_class_instance( assessor = create_builtin_class_instance(
exp_params.get('assessor').get('builtinAssessorName'), exp_params['assessor']['name'],
exp_params.get('assessor').get('classArgs'), exp_params['assessor'].get('classArgs'),
'assessors') 'assessors')
else: else:
assessor = create_customized_class_instance(exp_params.get('assessor')) assessor = create_customized_class_instance(exp_params['assessor'])
if assessor is None: if assessor is None:
raise AssertionError('Failed to create Assessor instance') raise AssertionError('Failed to create Assessor instance')
return assessor return assessor
......
...@@ -9,3 +9,4 @@ from .aml import * ...@@ -9,3 +9,4 @@ from .aml import *
from .kubeflow import * from .kubeflow import *
from .frameworkcontroller import * from .frameworkcontroller import *
from .adl import * from .adl import *
from .shared_storage import *
...@@ -101,6 +101,8 @@ class ConfigBase: ...@@ -101,6 +101,8 @@ class ConfigBase:
elif isinstance(value, ConfigBase): elif isinstance(value, ConfigBase):
setattr(ret, key, value.canonical()) setattr(ret, key, value.canonical())
# value will be copied twice, should not be a performance issue anyway # value will be copied twice, should not be a performance issue anyway
elif isinstance(value, Path):
setattr(ret, key, str(value))
return ret return ret
def validate(self) -> None: def validate(self) -> None:
......
...@@ -5,6 +5,8 @@ from dataclasses import dataclass ...@@ -5,6 +5,8 @@ from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from ruamel.yaml import YAML
from .base import ConfigBase, PathLike from .base import ConfigBase, PathLike
from . import util from . import util
...@@ -27,23 +29,27 @@ class _AlgorithmConfig(ConfigBase): ...@@ -27,23 +29,27 @@ class _AlgorithmConfig(ConfigBase):
super().validate() super().validate()
_validate_algo(self) _validate_algo(self)
@dataclass(init=False) @dataclass(init=False)
class AlgorithmConfig(_AlgorithmConfig): class AlgorithmConfig(_AlgorithmConfig):
name: str name: str
class_args: Optional[Dict[str, Any]] = None class_args: Optional[Dict[str, Any]] = None
@dataclass(init=False) @dataclass(init=False)
class CustomAlgorithmConfig(_AlgorithmConfig): class CustomAlgorithmConfig(_AlgorithmConfig):
class_name: str class_name: str
class_directory: Optional[PathLike] = None class_directory: Optional[PathLike] = '.'
class_args: Optional[Dict[str, Any]] = None class_args: Optional[Dict[str, Any]] = None
class TrainingServiceConfig(ConfigBase): class TrainingServiceConfig(ConfigBase):
platform: str platform: str
class SharedStorageConfig(ConfigBase):
storage_type: str
local_mount_point: str
remote_mount_point: str
local_mounted: str
@dataclass(init=False) @dataclass(init=False)
class ExperimentConfig(ConfigBase): class ExperimentConfig(ConfigBase):
...@@ -53,19 +59,21 @@ class ExperimentConfig(ConfigBase): ...@@ -53,19 +59,21 @@ class ExperimentConfig(ConfigBase):
trial_command: str trial_command: str
trial_code_directory: PathLike = '.' trial_code_directory: PathLike = '.'
trial_concurrency: int trial_concurrency: int
trial_gpu_number: Optional[int] = None trial_gpu_number: Optional[int] = None # TODO: in openpai cannot be None
max_experiment_duration: Optional[str] = None max_experiment_duration: Optional[str] = None
max_trial_number: Optional[int] = None max_trial_number: Optional[int] = None
nni_manager_ip: Optional[str] = None nni_manager_ip: Optional[str] = None
use_annotation: bool = False use_annotation: bool = False
debug: bool = False debug: bool = False
log_level: Optional[str] = None log_level: Optional[str] = None
experiment_working_directory: Optional[PathLike] = None experiment_working_directory: PathLike = '~/nni-experiments'
tuner_gpu_indices: Optional[Union[List[int], str]] = None tuner_gpu_indices: Optional[Union[List[int], str]] = None
tuner: Optional[_AlgorithmConfig] = None tuner: Optional[_AlgorithmConfig] = None
assessor: Optional[_AlgorithmConfig] = None assessor: Optional[_AlgorithmConfig] = None
advisor: Optional[_AlgorithmConfig] = None advisor: Optional[_AlgorithmConfig] = None
training_service: Union[TrainingServiceConfig, List[TrainingServiceConfig]] training_service: Union[TrainingServiceConfig, List[TrainingServiceConfig]]
shared_storage: Optional[SharedStorageConfig] = None
_deprecated: Optional[Dict[str, Any]] = None
def __init__(self, training_service_platform: Optional[Union[str, List[str]]] = None, **kwargs): def __init__(self, training_service_platform: Optional[Union[str, List[str]]] = None, **kwargs):
base_path = kwargs.pop('_base_path', None) base_path = kwargs.pop('_base_path', None)
...@@ -100,6 +108,12 @@ class ExperimentConfig(ConfigBase): ...@@ -100,6 +108,12 @@ class ExperimentConfig(ConfigBase):
if self.training_service.use_active_gpu is None: if self.training_service.use_active_gpu is None:
raise ValueError('Please set "use_active_gpu"') raise ValueError('Please set "use_active_gpu"')
def json(self) -> Dict[str, Any]:
obj = super().json()
if obj.get('searchSpaceFile'):
obj['searchSpace'] = YAML().load(open(obj.pop('searchSpaceFile')))
return obj
## End of public API ## ## End of public API ##
@property @property
...@@ -117,9 +131,9 @@ _canonical_rules = { ...@@ -117,9 +131,9 @@ _canonical_rules = {
'max_experiment_duration': lambda value: f'{util.parse_time(value)}s' if value is not None else None, 'max_experiment_duration': lambda value: f'{util.parse_time(value)}s' if value is not None else None,
'experiment_working_directory': util.canonical_path, 'experiment_working_directory': util.canonical_path,
'tuner_gpu_indices': lambda value: [int(idx) for idx in value.split(',')] if isinstance(value, str) else value, 'tuner_gpu_indices': lambda value: [int(idx) for idx in value.split(',')] if isinstance(value, str) else value,
'tuner': lambda config: None if config is None or config.name == '_none_' else config, 'tuner': lambda config: None if config is None or config.name == '_none_' else config.canonical(),
'assessor': lambda config: None if config is None or config.name == '_none_' else config, 'assessor': lambda config: None if config is None or config.name == '_none_' else config.canonical(),
'advisor': lambda config: None if config is None or config.name == '_none_' else config, 'advisor': lambda config: None if config is None or config.name == '_none_' else config.canonical(),
} }
_validation_rules = { _validation_rules = {
......
This diff is collapsed.
...@@ -56,7 +56,7 @@ class KubeflowConfig(TrainingServiceConfig): ...@@ -56,7 +56,7 @@ class KubeflowConfig(TrainingServiceConfig):
parameter_server: Optional[KubeflowRoleConfig] = None parameter_server: Optional[KubeflowRoleConfig] = None
def __init__(self, **kwargs): def __init__(self, **kwargs):
kwargs = util.case_insensitve(kwargs) kwargs = util.case_insensitive(kwargs)
kwargs['storage'] = util.load_config(_KubeflowStorageConfig, kwargs.get('storage')) kwargs['storage'] = util.load_config(_KubeflowStorageConfig, kwargs.get('storage'))
kwargs['worker'] = util.load_config(KubeflowRoleConfig, kwargs.get('worker')) kwargs['worker'] = util.load_config(KubeflowRoleConfig, kwargs.get('worker'))
kwargs['parameterserver'] = util.load_config(KubeflowRoleConfig, kwargs.get('parameterserver')) kwargs['parameterserver'] = util.load_config(KubeflowRoleConfig, kwargs.get('parameterserver'))
......
...@@ -23,7 +23,7 @@ class OpenpaiConfig(TrainingServiceConfig): ...@@ -23,7 +23,7 @@ class OpenpaiConfig(TrainingServiceConfig):
docker_image: str = 'msranni/nni:latest' docker_image: str = 'msranni/nni:latest'
local_storage_mount_point: PathLike local_storage_mount_point: PathLike
container_storage_mount_point: str container_storage_mount_point: str
reuse_mode: bool = False reuse_mode: bool = True
openpai_config: Optional[Dict[str, Any]] = None openpai_config: Optional[Dict[str, Any]] = None
openpai_config_file: Optional[PathLike] = None openpai_config_file: Optional[PathLike] = None
......
...@@ -46,7 +46,7 @@ class RemoteMachineConfig(ConfigBase): ...@@ -46,7 +46,7 @@ class RemoteMachineConfig(ConfigBase):
@dataclass(init=False) @dataclass(init=False)
class RemoteConfig(TrainingServiceConfig): class RemoteConfig(TrainingServiceConfig):
platform: str = 'remote' platform: str = 'remote'
reuse_mode: bool = False reuse_mode: bool = True
machine_list: List[RemoteMachineConfig] machine_list: List[RemoteMachineConfig]
def __init__(self, **kwargs): def __init__(self, **kwargs):
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from dataclasses import dataclass
from typing import Optional
from .common import SharedStorageConfig
__all__ = ['NfsConfig', 'AzureBlobConfig']
@dataclass(init=False)
class NfsConfig(SharedStorageConfig):
    """Shared storage backed by an NFS export mounted into trial containers."""
    # Discriminator consumed by the REST layer to select the storage backend.
    storage_type: str = 'NFS'
    # Host name or IP address of the NFS server.
    # NOTE(review): declared without a default after a defaulted field; this is
    # only legal because @dataclass(init=False) skips __init__ generation.
    nfs_server: str
    # Directory exported by the NFS server (the remote path to mount).
    exported_directory: str
@dataclass(init=False)
class AzureBlobConfig(SharedStorageConfig):
    """Shared storage backed by an Azure Blob Storage container."""
    # Discriminator consumed by the REST layer to select the storage backend.
    storage_type: str = 'AzureBlob'
    # Azure storage account holding the container.
    storage_account_name: str
    # Account key; optional — presumably another credential source (e.g. resource
    # group lookup) is used when absent.  TODO(review): confirm.
    storage_account_key: Optional[str] = None
    resource_group_name: Optional[str] = None
    # Blob container to mount.  No default; legal only under init=False.
    container_name: str
...@@ -19,7 +19,7 @@ def case_insensitive(key_or_kwargs: Union[str, Dict[str, Any]]) -> Union[str, Di ...@@ -19,7 +19,7 @@ def case_insensitive(key_or_kwargs: Union[str, Dict[str, Any]]) -> Union[str, Di
return {key.lower().replace('_', ''): value for key, value in key_or_kwargs.items()} return {key.lower().replace('_', ''): value for key, value in key_or_kwargs.items()}
def camel_case(key: str) -> str: def camel_case(key: str) -> str:
words = key.split('_') words = key.strip('_').split('_')
return words[0] + ''.join(word.title() for word in words[1:]) return words[0] + ''.join(word.title() for word in words[1:])
def canonical_path(path: Optional[PathLike]) -> Optional[str]: def canonical_path(path: Optional[PathLike]) -> Optional[str]:
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import contextlib import contextlib
import logging import logging
from pathlib import Path from pathlib import Path
...@@ -13,7 +16,6 @@ import nni_node # pylint: disable=import-error ...@@ -13,7 +16,6 @@ import nni_node # pylint: disable=import-error
import nni.runtime.protocol import nni.runtime.protocol
from .config import ExperimentConfig from .config import ExperimentConfig
from .config import convert
from .pipe import Pipe from .pipe import Pipe
from . import rest from . import rest
from ..tools.nnictl.config_utils import Experiments from ..tools.nnictl.config_utils import Experiments
...@@ -40,7 +42,7 @@ def start_experiment(exp_id: str, config: ExperimentConfig, port: int, debug: bo ...@@ -40,7 +42,7 @@ def start_experiment(exp_id: str, config: ExperimentConfig, port: int, debug: bo
_save_experiment_information(exp_id, port, start_time, platform, _save_experiment_information(exp_id, port, start_time, platform,
config.experiment_name, proc.pid, config.experiment_working_directory) config.experiment_name, proc.pid, config.experiment_working_directory)
_logger.info('Setting up...') _logger.info('Setting up...')
_init_experiment(config, port, debug) rest.post(port, '/experiment', config.json())
return proc return proc
except Exception as e: except Exception as e:
...@@ -75,7 +77,7 @@ def start_experiment_retiarii(exp_id: str, config: ExperimentConfig, port: int, ...@@ -75,7 +77,7 @@ def start_experiment_retiarii(exp_id: str, config: ExperimentConfig, port: int,
_save_experiment_information(exp_id, port, start_time, platform, _save_experiment_information(exp_id, port, start_time, platform,
config.experiment_name, proc.pid, config.experiment_working_directory) config.experiment_name, proc.pid, config.experiment_working_directory)
_logger.info('Setting up...') _logger.info('Setting up...')
_init_experiment(config, port, debug) rest.post(port, '/experiment', config.json())
return proc, pipe return proc, pipe
except Exception as e: except Exception as e:
...@@ -145,12 +147,6 @@ def _check_rest_server(port: int, retry: int = 3) -> None: ...@@ -145,12 +147,6 @@ def _check_rest_server(port: int, retry: int = 3) -> None:
rest.get(port, '/check-status') rest.get(port, '/check-status')
def _init_experiment(config: ExperimentConfig, port: int, debug: bool) -> None:
for cluster_metadata in convert.to_cluster_metadata(config):
rest.put(port, '/experiment/cluster-metadata', cluster_metadata)
rest.post(port, '/experiment', convert.to_rest_json(config))
def _save_experiment_information(experiment_id: str, port: int, start_time: int, platform: str, name: str, pid: int, logDir: str) -> None: def _save_experiment_information(experiment_id: str, port: int, start_time: int, platform: str, name: str, pid: int, logDir: str) -> None:
experiments_config = Experiments() experiments_config = Experiments()
experiments_config.add_experiment(experiment_id, port, start_time, platform, name, pid=pid, logDir=logDir) experiments_config.add_experiment(experiment_id, port, start_time, platform, name, pid=pid, logDir=logDir)
...@@ -35,11 +35,17 @@ def verify_algo_import(meta): ...@@ -35,11 +35,17 @@ def verify_algo_import(meta):
def algo_reg(args): def algo_reg(args):
meta_list = read_reg_meta_list(args.meta_path) meta_list = read_reg_meta_list(args.meta_path)
for meta in meta_list: for meta in meta_list:
if get_registered_algo_meta(meta['builtinName']) is not None: old = get_registered_algo_meta(meta['builtinName'])
print_error('builtinName {} already registered'.format(meta['builtinName'])) if old is None:
return verify_algo_import(meta)
verify_algo_import(meta) save_algo_meta_data(meta)
save_algo_meta_data(meta) elif old['source'] != 'nni':
verify_algo_import(meta)
print_green(f'Updating exist algorithm')
remove_algo_meta_data(meta['builtinName'])
save_algo_meta_data(meta)
else:
print_error(f'Cannot overwrite builtin algorithm')
print_green('{} registered sucessfully!'.format(meta['builtinName'])) print_green('{} registered sucessfully!'.format(meta['builtinName']))
def algo_unreg(args): def algo_unreg(args):
......
This diff is collapsed.
...@@ -124,4 +124,5 @@ def validate_all_content(experiment_config, config_path): ...@@ -124,4 +124,5 @@ def validate_all_content(experiment_config, config_path):
NNIConfigSchema().validate(experiment_config) NNIConfigSchema().validate(experiment_config)
experiment_config['maxExecDuration'] = parse_time(experiment_config['maxExecDuration']) if 'maxExecDuration' in experiment_config:
experiment_config['maxExecDuration'] = parse_time(experiment_config['maxExecDuration'])
...@@ -178,25 +178,24 @@ def create_customized_class_instance(class_params): ...@@ -178,25 +178,24 @@ def create_customized_class_instance(class_params):
---------- ----------
class_params: dict class_params: dict
class_params should contains following keys: class_params should contains following keys:
codeDir: code directory codeDirectory: code directory
classFileName: python file name of the class className: qualified class name
className: class name
classArgs (optional): kwargs pass to class constructor classArgs (optional): kwargs pass to class constructor
Returns: object Returns: object
------- -------
Returns customized class instance. Returns customized class instance.
""" """
code_dir = class_params.get('codeDir') code_dir = class_params.get('classDirectory')
class_filename = class_params.get('classFileName') qualified_class_name = class_params.get('className')
class_name = class_params.get('className')
class_args = class_params.get('classArgs') class_args = class_params.get('classArgs')
if not os.path.isfile(os.path.join(code_dir, class_filename)): if code_dir and not os.path.isdir(code_dir):
raise ValueError('Class file not found: {}'.format( raise ValueError(f'Directory not found: {code_dir}')
os.path.join(code_dir, class_filename)))
sys.path.append(code_dir) sys.path.append(code_dir)
module_name = os.path.splitext(class_filename)[0] module_name, class_name = qualified_class_name.rsplit('.', 1)
class_module = importlib.import_module(module_name) class_module = importlib.import_module(module_name)
class_constructor = getattr(class_module, class_name) class_constructor = getattr(class_module, class_name)
......
...@@ -45,13 +45,6 @@ testCases: ...@@ -45,13 +45,6 @@ testCases:
- name: multi-thread - name: multi-thread
configFile: test/config/multi_thread/config.yml configFile: test/config/multi_thread/config.yml
- name: multi-phase-batch
configFile: test/config/multi_phase/batch.yml
config:
# for batch tuner, maxTrialNum can not exceed length of search space
maxTrialNum: 2
trialConcurrency: 2
######################################################################### #########################################################################
# nni assessor test # nni assessor test
######################################################################### #########################################################################
......
...@@ -30,7 +30,8 @@ ...@@ -30,7 +30,8 @@
"argsIgnorePattern": "^_" "argsIgnorePattern": "^_"
} }
], ],
"@typescript-eslint/no-var-requires": 0 "@typescript-eslint/no-var-requires": 0,
"@typescript-eslint/no-non-null-assertion": 0
}, },
"ignorePatterns": [ "ignorePatterns": [
"node_modules/", "node_modules/",
......
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import * as assert from 'assert';
/**
 *  Base shape shared by every training-service section of the experiment config.
 *  Concrete services narrow `platform` to a string literal (e.g. 'local', 'remote').
 */
export interface TrainingServiceConfig {
    platform: string;
}
/* Local */

/** Run trials on the local machine. */
export interface LocalConfig extends TrainingServiceConfig {
    platform: 'local';
    // When true, GPUs already in use by other processes may still be assigned to trials.
    useActiveGpu?: boolean;
    maxTrialNumberPerGpu: number;
    // Restrict trials to these GPU indices; all GPUs are eligible when omitted.
    gpuIndices?: number[];
}

/* Remote */

/** SSH connection and scheduling settings for one remote worker machine. */
export interface RemoteMachineConfig {
    host: string;
    port: number;
    user: string;
    // Either password or key-based auth is used; sshKeyFile is non-optional here
    // — presumably a default path is filled in upstream.  TODO(review): confirm.
    password?: string;
    sshKeyFile: string;
    sshPassphrase?: string;
    useActiveGpu: boolean;
    maxTrialNumberPerGpu: number;
    gpuIndices?: number[];
    // Python interpreter path on the remote machine, when not on PATH.
    pythonPath?: string;
}

/** Run trials on a pool of SSH-reachable machines. */
export interface RemoteConfig extends TrainingServiceConfig {
    platform: 'remote';
    reuseMode: boolean;
    machineList: RemoteMachineConfig[];
}
/* OpenPAI */

/** Run trials on an OpenPAI cluster. */
export interface OpenpaiConfig extends TrainingServiceConfig {
    platform: 'openpai';
    host: string;
    username: string;
    // OpenPAI API token used for authentication.
    token: string;
    trialCpuNumber: number;
    // Human-readable size string, e.g. "4gb"; parsed with toMegaBytes().
    trialMemorySize: string;
    storageConfigName: string;
    dockerImage: string;
    // Where the cluster storage is mounted locally vs. inside the trial container.
    localStorageMountPoint: string;
    containerStorageMountPoint: string;
    reuseMode: boolean;
    // Extra fields passed through to the OpenPAI job config verbatim.
    openpaiConfig?: object;
}

/* AML */

/** Run trials on Azure Machine Learning. */
export interface AmlConfig extends TrainingServiceConfig {
    platform: 'aml';
    subscriptionId: string;
    resourceGroup: string;
    workspaceName: string;
    computeTarget: string;
    dockerImage: string;
}
/* Kubeflow */

// FIXME: merge with shared storage config
/** Storage settings for Kubeflow jobs (NFS- or Azure-backed, by `storage` value). */
export interface KubeflowStorageConfig {
    storage: string;
    // NFS fields.
    server?: string;
    path?: string;
    // Azure File fields.
    azureAccount?: string;
    azureShare?: string;
    keyVault?: string;
    keyVaultSecret?: string;
}

/** Resource/replica settings for one Kubeflow role (worker or parameter server). */
export interface KubeflowRoleConfig {
    replicas: number;
    command: string;
    gpuNumber: number;
    cpuNumber: number;
    // Human-readable size string, e.g. "8gb"; parsed with toMegaBytes().
    memorySize: string;
    dockerImage: string;
}

/** Run trials on a Kubernetes cluster via the Kubeflow operator. */
export interface KubeflowConfig extends TrainingServiceConfig {
    platform: 'kubeflow';
    // Kubeflow operator name and its CRD API version.
    operator: string;
    apiVersion: string;
    storage: KubeflowStorageConfig;
    worker: KubeflowRoleConfig;
    parameterServer?: KubeflowRoleConfig;
}

/* FrameworkController */

type FrameworkControllerStorageConfig = KubeflowStorageConfig;

/** One task role in a FrameworkController job. */
export interface FrameworkControllerRoleConfig {
    name: string;
    dockerImage: string;
    taskNumber: number;
    command: string;
    gpuNumber: number;
    cpuNumber: number;
    memorySize: string;
    // Completion policy thresholds forwarded to FrameworkController.
    attemptCompletionMinFailedTasks: number;
    attemptCompletionMinSucceededTasks: number;
}

/** Run trials on Kubernetes via Microsoft FrameworkController. */
export interface FrameworkControllerConfig extends TrainingServiceConfig {
    platform: 'frameworkcontroller';
    serviceAccountName: string;
    storage: FrameworkControllerStorageConfig;
    taskRoles: FrameworkControllerRoleConfig[];
}
/* shared storage */

/** Base shape for experiment-wide shared storage; backends narrow `storageType`. */
export interface SharedStorageConfig {
    storageType: string;
    // Mount point on the NNI manager machine vs. inside the trial environment.
    localMountPoint: string;
    remoteMountPoint: string;
    localMounted: string;
}

export interface NfsConfig extends SharedStorageConfig {
    storageType: 'NFS';
    nfsServer: string;
    exportedDirectory: string;
}

// NOTE(review): unlike NfsConfig, this does not narrow storageType to
// 'AzureBlob' — looks like an oversight; confirm against the Python side.
export interface AzureBlobConfig extends SharedStorageConfig {
    storageAccountName: string;
    storageAccountKey?: string;
    resourceGroupName?: string;
    containerName: string;
}
/* common */

/**
 *  Tuner/assessor/advisor settings.  Either `name` (builtin algorithm) or
 *  `className` + `codeDirectory` (custom algorithm) is expected to be set.
 */
export interface AlgorithmConfig {
    name?: string;
    className?: string;
    codeDirectory?: string;
    classArgs?: object;
}

/** Canonical (v2) experiment configuration received from nnictl / Python SDK. */
export interface ExperimentConfig {
    experimentName?: string;
    searchSpace: any;
    trialCommand: string;
    trialCodeDirectory: string;
    trialConcurrency: number;
    trialGpuNumber?: number;
    // Human-readable duration string, e.g. "1h"; parsed with toSeconds().
    maxExperimentDuration?: string;
    maxTrialNumber?: number;
    nniManagerIp?: string;
    //useAnnotation: boolean; // handled inside nnictl
    debug: boolean;
    logLevel?: string;
    experimentWorkingDirectory?: string;
    tunerGpuIndices?: number[];
    tuner?: AlgorithmConfig;
    assessor?: AlgorithmConfig;
    advisor?: AlgorithmConfig;
    // Single service, or several for hybrid training; see flattenConfig().
    trainingService: TrainingServiceConfig | TrainingServiceConfig[];
    sharedStorage?: SharedStorageConfig;
    deprecated?: any; // configs that are not yet natively supported by v2 (workaround)
}
/* util functions */
// Suffix → seconds conversion factors for duration strings.
const timeUnits = { d: 24 * 3600, h: 3600, m: 60, s: 1 };

/**
 *  Parse a duration string such as "1h", "30m", or "2d" into seconds.
 *  The unit suffix is case-insensitive; throws on an unrecognized suffix.
 */
export function toSeconds(time: string): number {
    const normalized = time.toLowerCase();
    for (const [suffix, secondsPerUnit] of Object.entries(timeUnits)) {
        if (normalized.endsWith(suffix)) {
            const amount = Number(time.slice(0, -1));
            return amount * secondsPerUnit;
        }
    }
    throw new Error(`Bad time string "${time}"`);
}
// Suffix → megabytes conversion factors for size strings.
// FIX: the gb factor was 1024 * 1024 (identical to tb), so "1gb" parsed as
// one terabyte.  Relative to the mb: 1 base, one gigabyte is 1024 MB.
const sizeUnits = { tb: 1024 * 1024, gb: 1024, mb: 1, kb: 1 / 1024 };

/**
 *  Parse a size string such as "4gb" or "512mb" into whole megabytes.
 *  The unit suffix is case-insensitive; fractional results are floored;
 *  throws on an unrecognized suffix.
 */
export function toMegaBytes(size: string): number {
    for (const [unit, factor] of Object.entries(sizeUnits)) {
        if (size.toLowerCase().endsWith(unit)) {
            const digits = size.slice(0, -2);
            return Math.floor(Number(digits) * factor);
        }
    }
    throw new Error(`Bad size string "${size}"`);
}
/**
 *  Render GPU indices as a CUDA_VISIBLE_DEVICES value.
 *  Returns the empty string (all GPUs hidden by convention left to the caller)
 *  when no indices are given.
 */
export function toCudaVisibleDevices(gpuIndices?: number[]): string {
    if (gpuIndices === undefined) {
        return '';
    }
    return gpuIndices.join(',');
}
/**
 *  Merge the training-service section for `platform` into a flat copy of the
 *  experiment config, so service-specific fields sit at the top level.
 *  With a list of services (hybrid mode), every entry matching `platform` is
 *  merged in order; with a single service its platform must match (asserted).
 */
export function flattenConfig<T>(config: ExperimentConfig, platform: string): T {
    const merged = { };
    Object.assign(merged, config);
    const services = config.trainingService;
    if (Array.isArray(services)) {
        for (const service of services) {
            if (service.platform === platform) {
                Object.assign(merged, service);
            }
        }
    } else {
        assert(services.platform === platform);
        Object.assign(merged, services);
    }
    return <T>merged;
}
...@@ -17,8 +17,14 @@ const INFO: number = 4; ...@@ -17,8 +17,14 @@ const INFO: number = 4;
const DEBUG: number = 5; const DEBUG: number = 5;
const TRACE: number = 6; const TRACE: number = 6;
const logLevelNameMap: Map<string, number> = new Map([['fatal', FATAL], const logLevelNameMap: Map<string, number> = new Map([
['error', ERROR], ['warning', WARNING], ['info', INFO], ['debug', DEBUG], ['trace', TRACE]]); ['fatal', FATAL],
['error', ERROR],
['warning', WARNING],
['info', INFO],
['debug', DEBUG],
['trace', TRACE]
]);
class BufferSerialEmitter { class BufferSerialEmitter {
private buffer: Buffer; private buffer: Buffer;
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
import { MetricDataRecord, MetricType, TrialJobInfo } from './datastore'; import { MetricDataRecord, MetricType, TrialJobInfo } from './datastore';
import { TrialJobStatus, LogType } from './trainingService'; import { TrialJobStatus, LogType } from './trainingService';
import { ExperimentConfig } from './experimentConfig';
type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM'; type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM';
type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL'; type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL';
...@@ -13,58 +14,12 @@ namespace ExperimentStartUpMode { ...@@ -13,58 +14,12 @@ namespace ExperimentStartUpMode {
export const RESUME = 'resume'; export const RESUME = 'resume';
} }
interface ExperimentParams {
authorName: string;
experimentName: string;
description?: string;
trialConcurrency: number;
maxExecDuration: number; //seconds
maxTrialNum: number;
searchSpace: string;
trainingServicePlatform: string;
multiPhase?: boolean;
multiThread?: boolean;
versionCheck?: boolean;
logCollection?: string;
tuner?: {
className?: string;
builtinTunerName?: string;
codeDir?: string;
classArgs?: any;
classFileName?: string;
checkpointDir: string;
includeIntermediateResults?: boolean;
gpuIndices?: string;
};
assessor?: {
className?: string;
builtinAssessorName?: string;
codeDir?: string;
classArgs?: any;
classFileName?: string;
checkpointDir: string;
};
advisor?: {
className?: string;
builtinAdvisorName?: string;
codeDir?: string;
classArgs?: any;
classFileName?: string;
checkpointDir: string;
gpuIndices?: string;
};
clusterMetaData?: {
key: string;
value: string;
}[];
}
interface ExperimentProfile { interface ExperimentProfile {
params: ExperimentParams; params: ExperimentConfig;
id: string; id: string;
execDuration: number; execDuration: number;
logDir?: string; logDir: string;
startTime?: number; startTime: number;
endTime?: number; endTime?: number;
nextSequenceId: number; nextSequenceId: number;
revision: number; revision: number;
...@@ -81,7 +36,7 @@ interface NNIManagerStatus { ...@@ -81,7 +36,7 @@ interface NNIManagerStatus {
} }
abstract class Manager { abstract class Manager {
public abstract startExperiment(experimentParams: ExperimentParams): Promise<string>; public abstract startExperiment(experimentConfig: ExperimentConfig): Promise<string>;
public abstract resumeExperiment(readonly: boolean): Promise<void>; public abstract resumeExperiment(readonly: boolean): Promise<void>;
public abstract stopExperiment(): Promise<void>; public abstract stopExperiment(): Promise<void>;
public abstract stopExperimentTopHalf(): Promise<void>; public abstract stopExperimentTopHalf(): Promise<void>;
...@@ -113,4 +68,4 @@ abstract class Manager { ...@@ -113,4 +68,4 @@ abstract class Manager {
public abstract fetchTrialOutput(trialJobId: string, subpath: string): Promise<void>; public abstract fetchTrialOutput(trialJobId: string, subpath: string): Promise<void>;
} }
export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus, ExperimentStatus, ExperimentStartUpMode }; export { Manager, ExperimentConfig, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus, ExperimentStatus, ExperimentStartUpMode };
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment