Unverified Commit 817ec68b authored by liuzhe-lz's avatar liuzhe-lz Committed by GitHub
Browse files

Add native support for v2 config (#3466)

parent 6aaca5f7
......@@ -7,7 +7,7 @@ import logging
import json
import base64
from .runtime.common import enable_multi_thread, enable_multi_phase
from .runtime.common import enable_multi_thread
from .runtime.msg_dispatcher import MsgDispatcher
from .tools.package_utils import create_builtin_class_instance, create_customized_class_instance
......@@ -29,10 +29,8 @@ def main():
exp_params = json.loads(exp_params_decode)
logger.debug('exp_params json obj: [%s]', json.dumps(exp_params, indent=4))
if exp_params.get('multiThread'):
if exp_params.get('deprecated', {}).get('multiThread'):
enable_multi_thread()
if exp_params.get('multiPhase'):
enable_multi_phase()
if exp_params.get('advisor') is not None:
# advisor is enabled and starts to run
......@@ -61,10 +59,10 @@ def main():
def _run_advisor(exp_params):
if exp_params.get('advisor').get('builtinAdvisorName'):
if exp_params.get('advisor').get('name'):
dispatcher = create_builtin_class_instance(
exp_params.get('advisor').get('builtinAdvisorName'),
exp_params.get('advisor').get('classArgs'),
exp_params['advisor']['name'],
exp_params['advisor'].get('classArgs'),
'advisors')
else:
dispatcher = create_customized_class_instance(exp_params.get('advisor'))
......@@ -78,26 +76,26 @@ def _run_advisor(exp_params):
def _create_tuner(exp_params):
if exp_params.get('tuner').get('builtinTunerName'):
if exp_params['tuner'].get('name'):
tuner = create_builtin_class_instance(
exp_params.get('tuner').get('builtinTunerName'),
exp_params.get('tuner').get('classArgs'),
exp_params['tuner']['name'],
exp_params['tuner'].get('classArgs'),
'tuners')
else:
tuner = create_customized_class_instance(exp_params.get('tuner'))
tuner = create_customized_class_instance(exp_params['tuner'])
if tuner is None:
raise AssertionError('Failed to create Tuner instance')
return tuner
def _create_assessor(exp_params):
if exp_params.get('assessor').get('builtinAssessorName'):
if exp_params['assessor'].get('name'):
assessor = create_builtin_class_instance(
exp_params.get('assessor').get('builtinAssessorName'),
exp_params.get('assessor').get('classArgs'),
exp_params['assessor']['name'],
exp_params['assessor'].get('classArgs'),
'assessors')
else:
assessor = create_customized_class_instance(exp_params.get('assessor'))
assessor = create_customized_class_instance(exp_params['assessor'])
if assessor is None:
raise AssertionError('Failed to create Assessor instance')
return assessor
......
......@@ -9,3 +9,4 @@ from .aml import *
from .kubeflow import *
from .frameworkcontroller import *
from .adl import *
from .shared_storage import *
......@@ -101,6 +101,8 @@ class ConfigBase:
elif isinstance(value, ConfigBase):
setattr(ret, key, value.canonical())
# value will be copied twice, should not be a performance issue anyway
elif isinstance(value, Path):
setattr(ret, key, str(value))
return ret
def validate(self) -> None:
......
......@@ -5,6 +5,8 @@ from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from ruamel.yaml import YAML
from .base import ConfigBase, PathLike
from . import util
......@@ -27,23 +29,27 @@ class _AlgorithmConfig(ConfigBase):
super().validate()
_validate_algo(self)
@dataclass(init=False)
class AlgorithmConfig(_AlgorithmConfig):
name: str
class_args: Optional[Dict[str, Any]] = None
@dataclass(init=False)
class CustomAlgorithmConfig(_AlgorithmConfig):
class_name: str
class_directory: Optional[PathLike] = None
class_directory: Optional[PathLike] = '.'
class_args: Optional[Dict[str, Any]] = None
class TrainingServiceConfig(ConfigBase):
platform: str
class SharedStorageConfig(ConfigBase):
storage_type: str
local_mount_point: str
remote_mount_point: str
local_mounted: str
@dataclass(init=False)
class ExperimentConfig(ConfigBase):
......@@ -53,19 +59,21 @@ class ExperimentConfig(ConfigBase):
trial_command: str
trial_code_directory: PathLike = '.'
trial_concurrency: int
trial_gpu_number: Optional[int] = None
trial_gpu_number: Optional[int] = None # TODO: in openpai cannot be None
max_experiment_duration: Optional[str] = None
max_trial_number: Optional[int] = None
nni_manager_ip: Optional[str] = None
use_annotation: bool = False
debug: bool = False
log_level: Optional[str] = None
experiment_working_directory: Optional[PathLike] = None
experiment_working_directory: PathLike = '~/nni-experiments'
tuner_gpu_indices: Optional[Union[List[int], str]] = None
tuner: Optional[_AlgorithmConfig] = None
assessor: Optional[_AlgorithmConfig] = None
advisor: Optional[_AlgorithmConfig] = None
training_service: Union[TrainingServiceConfig, List[TrainingServiceConfig]]
shared_storage: Optional[SharedStorageConfig] = None
_deprecated: Optional[Dict[str, Any]] = None
def __init__(self, training_service_platform: Optional[Union[str, List[str]]] = None, **kwargs):
base_path = kwargs.pop('_base_path', None)
......@@ -100,6 +108,12 @@ class ExperimentConfig(ConfigBase):
if self.training_service.use_active_gpu is None:
raise ValueError('Please set "use_active_gpu"')
def json(self) -> Dict[str, Any]:
obj = super().json()
if obj.get('searchSpaceFile'):
obj['searchSpace'] = YAML().load(open(obj.pop('searchSpaceFile')))
return obj
## End of public API ##
@property
......@@ -117,9 +131,9 @@ _canonical_rules = {
'max_experiment_duration': lambda value: f'{util.parse_time(value)}s' if value is not None else None,
'experiment_working_directory': util.canonical_path,
'tuner_gpu_indices': lambda value: [int(idx) for idx in value.split(',')] if isinstance(value, str) else value,
'tuner': lambda config: None if config is None or config.name == '_none_' else config,
'assessor': lambda config: None if config is None or config.name == '_none_' else config,
'advisor': lambda config: None if config is None or config.name == '_none_' else config,
'tuner': lambda config: None if config is None or config.name == '_none_' else config.canonical(),
'assessor': lambda config: None if config is None or config.name == '_none_' else config.canonical(),
'advisor': lambda config: None if config is None or config.name == '_none_' else config.canonical(),
}
_validation_rules = {
......
This diff is collapsed.
......@@ -56,7 +56,7 @@ class KubeflowConfig(TrainingServiceConfig):
parameter_server: Optional[KubeflowRoleConfig] = None
def __init__(self, **kwargs):
kwargs = util.case_insensitve(kwargs)
kwargs = util.case_insensitive(kwargs)
kwargs['storage'] = util.load_config(_KubeflowStorageConfig, kwargs.get('storage'))
kwargs['worker'] = util.load_config(KubeflowRoleConfig, kwargs.get('worker'))
kwargs['parameterserver'] = util.load_config(KubeflowRoleConfig, kwargs.get('parameterserver'))
......
......@@ -23,7 +23,7 @@ class OpenpaiConfig(TrainingServiceConfig):
docker_image: str = 'msranni/nni:latest'
local_storage_mount_point: PathLike
container_storage_mount_point: str
reuse_mode: bool = False
reuse_mode: bool = True
openpai_config: Optional[Dict[str, Any]] = None
openpai_config_file: Optional[PathLike] = None
......
......@@ -46,7 +46,7 @@ class RemoteMachineConfig(ConfigBase):
@dataclass(init=False)
class RemoteConfig(TrainingServiceConfig):
platform: str = 'remote'
reuse_mode: bool = False
reuse_mode: bool = True
machine_list: List[RemoteMachineConfig]
def __init__(self, **kwargs):
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from dataclasses import dataclass
from typing import Optional
from .common import SharedStorageConfig
__all__ = ['NfsConfig', 'AzureBlobConfig']
@dataclass(init=False)
class NfsConfig(SharedStorageConfig):
    """Shared storage backed by an NFS export (``storage_type`` fixed to 'NFS')."""
    storage_type: str = 'NFS'
    nfs_server: str          # hostname or IP of the NFS server
    exported_directory: str  # directory exported by the server
@dataclass(init=False)
class AzureBlobConfig(SharedStorageConfig):
    """Shared storage backed by an Azure Blob container."""
    storage_type: str = 'AzureBlob'
    # NOTE(review): required fields follow defaulted ones; this is only legal
    # because init=False suppresses dataclass __init__ generation (ConfigBase
    # performs initialization instead).
    storage_account_name: str
    storage_account_key: Optional[str] = None    # account key — presumably optional when resource group auth is used; TODO confirm
    resource_group_name: Optional[str] = None
    container_name: str
......@@ -19,7 +19,7 @@ def case_insensitive(key_or_kwargs: Union[str, Dict[str, Any]]) -> Union[str, Di
return {key.lower().replace('_', ''): value for key, value in key_or_kwargs.items()}
def camel_case(key: str) -> str:
words = key.split('_')
words = key.strip('_').split('_')
return words[0] + ''.join(word.title() for word in words[1:])
def canonical_path(path: Optional[PathLike]) -> Optional[str]:
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import contextlib
import logging
from pathlib import Path
......@@ -13,7 +16,6 @@ import nni_node # pylint: disable=import-error
import nni.runtime.protocol
from .config import ExperimentConfig
from .config import convert
from .pipe import Pipe
from . import rest
from ..tools.nnictl.config_utils import Experiments
......@@ -40,7 +42,7 @@ def start_experiment(exp_id: str, config: ExperimentConfig, port: int, debug: bo
_save_experiment_information(exp_id, port, start_time, platform,
config.experiment_name, proc.pid, config.experiment_working_directory)
_logger.info('Setting up...')
_init_experiment(config, port, debug)
rest.post(port, '/experiment', config.json())
return proc
except Exception as e:
......@@ -75,7 +77,7 @@ def start_experiment_retiarii(exp_id: str, config: ExperimentConfig, port: int,
_save_experiment_information(exp_id, port, start_time, platform,
config.experiment_name, proc.pid, config.experiment_working_directory)
_logger.info('Setting up...')
_init_experiment(config, port, debug)
rest.post(port, '/experiment', config.json())
return proc, pipe
except Exception as e:
......@@ -145,12 +147,6 @@ def _check_rest_server(port: int, retry: int = 3) -> None:
rest.get(port, '/check-status')
def _init_experiment(config: ExperimentConfig, port: int, debug: bool) -> None:
for cluster_metadata in convert.to_cluster_metadata(config):
rest.put(port, '/experiment/cluster-metadata', cluster_metadata)
rest.post(port, '/experiment', convert.to_rest_json(config))
def _save_experiment_information(experiment_id: str, port: int, start_time: int, platform: str, name: str, pid: int, logDir: str) -> None:
experiments_config = Experiments()
experiments_config.add_experiment(experiment_id, port, start_time, platform, name, pid=pid, logDir=logDir)
......@@ -35,11 +35,17 @@ def verify_algo_import(meta):
def algo_reg(args):
meta_list = read_reg_meta_list(args.meta_path)
for meta in meta_list:
if get_registered_algo_meta(meta['builtinName']) is not None:
print_error('builtinName {} already registered'.format(meta['builtinName']))
return
verify_algo_import(meta)
save_algo_meta_data(meta)
old = get_registered_algo_meta(meta['builtinName'])
if old is None:
verify_algo_import(meta)
save_algo_meta_data(meta)
elif old['source'] != 'nni':
verify_algo_import(meta)
print_green(f'Updating exist algorithm')
remove_algo_meta_data(meta['builtinName'])
save_algo_meta_data(meta)
else:
print_error(f'Cannot overwrite builtin algorithm')
print_green('{} registered sucessfully!'.format(meta['builtinName']))
def algo_unreg(args):
......
This diff is collapsed.
......@@ -124,4 +124,5 @@ def validate_all_content(experiment_config, config_path):
NNIConfigSchema().validate(experiment_config)
experiment_config['maxExecDuration'] = parse_time(experiment_config['maxExecDuration'])
if 'maxExecDuration' in experiment_config:
experiment_config['maxExecDuration'] = parse_time(experiment_config['maxExecDuration'])
......@@ -178,25 +178,24 @@ def create_customized_class_instance(class_params):
----------
class_params: dict
class_params should contains following keys:
codeDir: code directory
classFileName: python file name of the class
className: class name
codeDirectory: code directory
className: qualified class name
classArgs (optional): kwargs pass to class constructor
Returns: object
-------
Returns customized class instance.
"""
code_dir = class_params.get('codeDir')
class_filename = class_params.get('classFileName')
class_name = class_params.get('className')
code_dir = class_params.get('classDirectory')
qualified_class_name = class_params.get('className')
class_args = class_params.get('classArgs')
if not os.path.isfile(os.path.join(code_dir, class_filename)):
raise ValueError('Class file not found: {}'.format(
os.path.join(code_dir, class_filename)))
if code_dir and not os.path.isdir(code_dir):
raise ValueError(f'Directory not found: {code_dir}')
sys.path.append(code_dir)
module_name = os.path.splitext(class_filename)[0]
module_name, class_name = qualified_class_name.rsplit('.', 1)
class_module = importlib.import_module(module_name)
class_constructor = getattr(class_module, class_name)
......
......@@ -45,13 +45,6 @@ testCases:
- name: multi-thread
configFile: test/config/multi_thread/config.yml
- name: multi-phase-batch
configFile: test/config/multi_phase/batch.yml
config:
# for batch tuner, maxTrialNum can not exceed length of search space
maxTrialNum: 2
trialConcurrency: 2
#########################################################################
# nni assessor test
#########################################################################
......
......@@ -30,7 +30,8 @@
"argsIgnorePattern": "^_"
}
],
"@typescript-eslint/no-var-requires": 0
"@typescript-eslint/no-var-requires": 0,
"@typescript-eslint/no-non-null-assertion": 0
},
"ignorePatterns": [
"node_modules/",
......
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import * as assert from 'assert';
// Base type for all training service (platform) configurations;
// `platform` is the discriminant for the concrete variants below.
export interface TrainingServiceConfig {
    platform: string;
}
/* Local */
// Run trials on the local machine.
export interface LocalConfig extends TrainingServiceConfig {
    platform: 'local';
    useActiveGpu?: boolean;        // allow scheduling on GPUs that are already in use — TODO confirm
    maxTrialNumberPerGpu: number;  // max concurrent trials sharing one GPU
    gpuIndices?: number[];         // restrict trials to these GPUs; undefined = no restriction
}
/* Remote */
// SSH connection and GPU-scheduling settings for one remote machine.
export interface RemoteMachineConfig {
    host: string;
    port: number;
    user: string;
    password?: string;       // password auth; sshKeyFile used otherwise — TODO confirm precedence
    sshKeyFile: string;
    sshPassphrase?: string;  // passphrase for sshKeyFile, if the key is encrypted
    useActiveGpu: boolean;
    maxTrialNumberPerGpu: number;
    gpuIndices?: number[];
    pythonPath?: string;     // python environment path on the remote machine — TODO confirm semantics
}
// Run trials on remote machines over SSH.
export interface RemoteConfig extends TrainingServiceConfig {
    platform: 'remote';
    reuseMode: boolean;      // reuse a long-lived environment across trials (defaults to true on the Python side)
    machineList: RemoteMachineConfig[];
}
/* OpenPAI */
// Run trials on an OpenPAI cluster.
export interface OpenpaiConfig extends TrainingServiceConfig {
    platform: 'openpai';
    host: string;                        // OpenPAI REST server address
    username: string;
    token: string;                       // OpenPAI API token
    trialCpuNumber: number;
    trialMemorySize: string;             // size string, e.g. "8gb" — see toMegaBytes()
    storageConfigName: string;
    dockerImage: string;                 // defaults to 'msranni/nni:latest' on the Python side
    localStorageMountPoint: string;
    containerStorageMountPoint: string;
    reuseMode: boolean;
    openpaiConfig?: object;              // raw OpenPAI job config passthrough — TODO confirm
}
/* AML */
// Run trials on Azure Machine Learning.
export interface AmlConfig extends TrainingServiceConfig {
    platform: 'aml';
    subscriptionId: string;
    resourceGroup: string;
    workspaceName: string;
    computeTarget: string;
    dockerImage: string;
}
/* Kubeflow */
// FIXME: merge with shared storage config
// Storage used by Kubeflow trials: either NFS (server/path) or an Azure
// file share (azureAccount/azureShare plus key-vault credentials).
export interface KubeflowStorageConfig {
    storage: string;        // storage kind discriminator — presumably 'nfs' | 'azureStorage'; TODO confirm
    server?: string;        // NFS server address
    path?: string;          // NFS exported path
    azureAccount?: string;
    azureShare?: string;
    keyVault?: string;      // key vault holding the storage credential — TODO confirm
    keyVaultSecret?: string;
}
// Resource spec for one Kubeflow role (worker or parameter server).
export interface KubeflowRoleConfig {
    replicas: number;     // number of pods for this role
    command: string;      // command run in each pod
    gpuNumber: number;
    cpuNumber: number;
    memorySize: string;   // size string, e.g. "8gb" — see toMegaBytes()
    dockerImage: string;
}
// Run trials on Kubernetes via Kubeflow operators.
export interface KubeflowConfig extends TrainingServiceConfig {
    platform: 'kubeflow';
    operator: string;      // Kubeflow operator to use — TODO confirm valid values
    apiVersion: string;
    storage: KubeflowStorageConfig;
    worker: KubeflowRoleConfig;
    parameterServer?: KubeflowRoleConfig;
}
/* FrameworkController */
type FrameworkControllerStorageConfig = KubeflowStorageConfig;
// One task role in a FrameworkController job.
export interface FrameworkControllerRoleConfig {
    name: string;
    dockerImage: string;
    taskNumber: number;
    command: string;
    gpuNumber: number;
    cpuNumber: number;
    memorySize: string;
    attemptCompletionMinFailedTasks: number;     // failed-task threshold for attempt completion — TODO confirm exact semantics
    attemptCompletionMinSucceededTasks: number;  // succeeded-task threshold for attempt completion — TODO confirm exact semantics
}
// Run trials on Kubernetes via Microsoft FrameworkController.
export interface FrameworkControllerConfig extends TrainingServiceConfig {
    platform: 'frameworkcontroller';
    serviceAccountName: string;
    storage: FrameworkControllerStorageConfig;
    taskRoles: FrameworkControllerRoleConfig[];
}
/* shared storage */
// Base shared-storage settings; concrete variants narrow `storageType`.
export interface SharedStorageConfig {
    storageType: string;
    localMountPoint: string;   // mount point on the NNI manager machine
    remoteMountPoint: string;  // mount point on the training-service side — TODO confirm
    localMounted: string;      // whether/how the local point is already mounted — presumably an enum-like string; TODO confirm
}
// NFS-backed shared storage.
export interface NfsConfig extends SharedStorageConfig {
    storageType: 'NFS';
    nfsServer: string;          // hostname or IP of the NFS server
    exportedDirectory: string;  // directory exported by the server
}
// Azure-Blob-backed shared storage.
// Fix: narrow `storageType` to the 'AzureBlob' literal, matching the NfsConfig
// pattern above and the Python AzureBlobConfig default ('AzureBlob'); the
// original left the discriminant un-narrowed on this variant only.
export interface AzureBlobConfig extends SharedStorageConfig {
    storageType: 'AzureBlob';
    storageAccountName: string;
    storageAccountKey?: string;    // account key — presumably optional when resource-group auth is used; TODO confirm
    resourceGroupName?: string;
    containerName: string;
}
/* common */
// Tuner / assessor / advisor configuration: either a builtin algorithm
// (`name`) or a custom class (`className`, optionally with `codeDirectory`).
export interface AlgorithmConfig {
    name?: string;           // builtin algorithm name
    className?: string;      // qualified class name for a custom algorithm
    codeDirectory?: string;  // directory containing the custom class's module
    classArgs?: object;      // kwargs passed to the algorithm constructor
}
// Canonical (v2) experiment configuration, as posted to the REST server
// by the Python side via ExperimentConfig.json().
export interface ExperimentConfig {
    experimentName?: string;
    searchSpace: any;                // search space object (a searchSpaceFile is resolved on the Python side)
    trialCommand: string;
    trialCodeDirectory: string;
    trialConcurrency: number;
    trialGpuNumber?: number;
    maxExperimentDuration?: string;  // time string, e.g. "1h" — see toSeconds()
    maxTrialNumber?: number;
    nniManagerIp?: string;
    //useAnnotation: boolean; // handled inside nnictl
    debug: boolean;
    logLevel?: string;
    experimentWorkingDirectory?: string;
    tunerGpuIndices?: number[];
    tuner?: AlgorithmConfig;
    assessor?: AlgorithmConfig;
    advisor?: AlgorithmConfig;
    trainingService: TrainingServiceConfig | TrainingServiceConfig[];
    sharedStorage?: SharedStorageConfig;
    deprecated?: any; // configs that are not yet natively supported by v2 (workaround)
}
/* util functions */
// Time-unit suffix → seconds conversion factors.
const timeUnits = { d: 24 * 3600, h: 3600, m: 60, s: 1 };

/**
 * Convert a time string such as "1h" or "30m" to seconds.
 * The single-character unit suffix is case-insensitive; an unrecognized
 * suffix raises an Error.
 */
export function toSeconds(time: string): number {
    const lowered = time.toLowerCase();
    for (const [suffix, multiplier] of Object.entries(timeUnits)) {
        if (lowered.endsWith(suffix)) {
            const amount = Number(time.slice(0, -1));
            return amount * multiplier;
        }
    }
    throw new Error(`Bad time string "${time}"`);
}
// Size-unit suffix → megabytes conversion factors (binary units).
// Fix: the original set gb to 1024 * 1024 — the same factor as tb — so
// gigabyte sizes were inflated 1024×. 1 GB = 1024 MB.
const sizeUnits = { tb: 1024 * 1024, gb: 1024, mb: 1, kb: 1 / 1024 };

/**
 * Convert a size string such as "8gb" or "512mb" to whole megabytes (floored).
 * The two-character unit suffix is case-insensitive; an unrecognized suffix
 * raises an Error.
 */
export function toMegaBytes(size: string): number {
    for (const [unit, factor] of Object.entries(sizeUnits)) {
        if (size.toLowerCase().endsWith(unit)) {
            const digits = size.slice(0, -2);
            return Math.floor(Number(digits) * factor);
        }
    }
    throw new Error(`Bad size string "${size}"`);
}
/**
 * Render GPU indices as a CUDA_VISIBLE_DEVICES value.
 * Returns an empty string when no indices are given.
 */
export function toCudaVisibleDevices(gpuIndices?: number[]): string {
    if (gpuIndices === undefined) {
        return '';
    }
    return gpuIndices.join(',');
}
/**
 * Flatten an experiment config for one platform: shallow-merge the matching
 * training service entry's fields over the top-level config fields.
 * When trainingService is a single object its platform must equal `platform`
 * (asserted); when it is a list, every entry matching `platform` is merged
 * in order.
 */
export function flattenConfig<T>(config: ExperimentConfig, platform: string): T {
    let merged: object = { ...config };
    const service = config.trainingService;
    if (Array.isArray(service)) {
        for (const entry of service) {
            if (entry.platform === platform) {
                merged = { ...merged, ...entry };
            }
        }
    } else {
        assert(service.platform === platform);
        merged = { ...merged, ...service };
    }
    return <T>merged;
}
......@@ -17,8 +17,14 @@ const INFO: number = 4;
const DEBUG: number = 5;
const TRACE: number = 6;
const logLevelNameMap: Map<string, number> = new Map([['fatal', FATAL],
['error', ERROR], ['warning', WARNING], ['info', INFO], ['debug', DEBUG], ['trace', TRACE]]);
const logLevelNameMap: Map<string, number> = new Map([
['fatal', FATAL],
['error', ERROR],
['warning', WARNING],
['info', INFO],
['debug', DEBUG],
['trace', TRACE]
]);
class BufferSerialEmitter {
private buffer: Buffer;
......
......@@ -5,6 +5,7 @@
import { MetricDataRecord, MetricType, TrialJobInfo } from './datastore';
import { TrialJobStatus, LogType } from './trainingService';
import { ExperimentConfig } from './experimentConfig';
type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM';
type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL';
......@@ -13,58 +14,12 @@ namespace ExperimentStartUpMode {
export const RESUME = 'resume';
}
interface ExperimentParams {
authorName: string;
experimentName: string;
description?: string;
trialConcurrency: number;
maxExecDuration: number; //seconds
maxTrialNum: number;
searchSpace: string;
trainingServicePlatform: string;
multiPhase?: boolean;
multiThread?: boolean;
versionCheck?: boolean;
logCollection?: string;
tuner?: {
className?: string;
builtinTunerName?: string;
codeDir?: string;
classArgs?: any;
classFileName?: string;
checkpointDir: string;
includeIntermediateResults?: boolean;
gpuIndices?: string;
};
assessor?: {
className?: string;
builtinAssessorName?: string;
codeDir?: string;
classArgs?: any;
classFileName?: string;
checkpointDir: string;
};
advisor?: {
className?: string;
builtinAdvisorName?: string;
codeDir?: string;
classArgs?: any;
classFileName?: string;
checkpointDir: string;
gpuIndices?: string;
};
clusterMetaData?: {
key: string;
value: string;
}[];
}
interface ExperimentProfile {
params: ExperimentParams;
params: ExperimentConfig;
id: string;
execDuration: number;
logDir?: string;
startTime?: number;
logDir: string;
startTime: number;
endTime?: number;
nextSequenceId: number;
revision: number;
......@@ -81,7 +36,7 @@ interface NNIManagerStatus {
}
abstract class Manager {
public abstract startExperiment(experimentParams: ExperimentParams): Promise<string>;
public abstract startExperiment(experimentConfig: ExperimentConfig): Promise<string>;
public abstract resumeExperiment(readonly: boolean): Promise<void>;
public abstract stopExperiment(): Promise<void>;
public abstract stopExperimentTopHalf(): Promise<void>;
......@@ -113,4 +68,4 @@ abstract class Manager {
public abstract fetchTrialOutput(trialJobId: string, subpath: string): Promise<void>;
}
export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus, ExperimentStatus, ExperimentStartUpMode };
export { Manager, ExperimentConfig, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus, ExperimentStatus, ExperimentStartUpMode };
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment