Unverified Commit dde4d862 authored by Ni Hao's avatar Ni Hao Committed by GitHub
Browse files

Add maxTrialDuration (#3863)

parent 39432390
...@@ -20,6 +20,7 @@ This document describes the rules to write the config file, and provides some ex ...@@ -20,6 +20,7 @@ This document describes the rules to write the config file, and provides some ex
* `versionCheck <#versioncheck>`__ * `versionCheck <#versioncheck>`__
* `debug <#debug>`__ * `debug <#debug>`__
* `maxTrialNum <#maxtrialnum>`__ * `maxTrialNum <#maxtrialnum>`__
* `maxTrialDuration <#maxtrialduration>`__
* `trainingServicePlatform <#trainingserviceplatform>`__ * `trainingServicePlatform <#trainingserviceplatform>`__
* `searchSpacePath <#searchspacepath>`__ * `searchSpacePath <#searchspacepath>`__
* `useAnnotation <#useannotation>`__ * `useAnnotation <#useannotation>`__
...@@ -254,7 +255,7 @@ maxExecDuration ...@@ -254,7 +255,7 @@ maxExecDuration
Optional. String. Default: 999d. Optional. String. Default: 999d.
**maxExecDuration** specifies the max duration time of an experiment. The unit of the time is {**s**\ **m**\ , **h**\ , **d**\ }, which means {*seconds*\ , *minutes*\ , *hours*\ , *days*\ }. **maxExecDuration** specifies the max duration time of an experiment. The unit of the time is {**s**\ , **m**\ , **h**\ , **d**\ }, which means {*seconds*\ , *minutes*\ , *hours*\ , *days*\ }.
Note: The maxExecDuration spec set the time of an experiment, not a trial job. If the experiment reach the max duration time, the experiment will not stop, but could not submit new trial jobs any more. Note: The maxExecDuration spec set the time of an experiment, not a trial job. If the experiment reach the max duration time, the experiment will not stop, but could not submit new trial jobs any more.
...@@ -279,6 +280,13 @@ Optional. Integer between 1 and 99999. Default: 99999. ...@@ -279,6 +280,13 @@ Optional. Integer between 1 and 99999. Default: 99999.
Specifies the max number of trial jobs created by NNI, including succeeded and failed jobs. Specifies the max number of trial jobs created by NNI, including succeeded and failed jobs.
maxTrialDuration
^^^^^^^^^^^^^^^^
Optional. String. Default: 999d.
**maxTrialDuration** specifies the max duration time of each trial job. The unit of the time is {**s**\ , **m**\ , **h**\ , **d**\ }, which means {*seconds*\ , *minutes*\ , *hours*\ , *days*\ }. If a trial job reaches the max duration time, that trial job will be stopped.
trainingServicePlatform trainingServicePlatform
^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^
......
...@@ -214,6 +214,20 @@ type: ``Optional[int]`` ...@@ -214,6 +214,20 @@ type: ``Optional[int]``
When the budget runs out, the experiment will stop creating trials but continue to serve WebUI. When the budget runs out, the experiment will stop creating trials but continue to serve WebUI.
maxTrialDuration
---------------------
Limit the duration of each trial job, if specified.
type: ``Optional[str]``
format: ``number + s|m|h|d``
examples: ``"10m"``, ``"0.5h"``
When time runs out, the current trial job will stop.
nniManagerIp nniManagerIp
------------ ------------
......
...@@ -65,6 +65,7 @@ class ExperimentConfig(ConfigBase): ...@@ -65,6 +65,7 @@ class ExperimentConfig(ConfigBase):
trial_gpu_number: Optional[int] = None # TODO: in openpai cannot be None trial_gpu_number: Optional[int] = None # TODO: in openpai cannot be None
max_experiment_duration: Optional[str] = None max_experiment_duration: Optional[str] = None
max_trial_number: Optional[int] = None max_trial_number: Optional[int] = None
max_trial_duration: Optional[int] = None
nni_manager_ip: Optional[str] = None nni_manager_ip: Optional[str] = None
use_annotation: bool = False use_annotation: bool = False
debug: bool = False debug: bool = False
...@@ -153,6 +154,7 @@ _validation_rules = { ...@@ -153,6 +154,7 @@ _validation_rules = {
'trial_gpu_number': lambda value: value >= 0, 'trial_gpu_number': lambda value: value >= 0,
'max_experiment_duration': lambda value: util.parse_time(value) > 0, 'max_experiment_duration': lambda value: util.parse_time(value) > 0,
'max_trial_number': lambda value: value > 0, 'max_trial_number': lambda value: value > 0,
'max_trial_duration': lambda value: util.parse_time(value) > 0,
'log_level': lambda value: value in ["trace", "debug", "info", "warning", "error", "fatal"], 'log_level': lambda value: value in ["trace", "debug", "info", "warning", "error", "fatal"],
'tuner_gpu_indices': lambda value: all(i >= 0 for i in value) and len(value) == len(set(value)), 'tuner_gpu_indices': lambda value: all(i >= 0 for i in value) and len(value) == len(set(value)),
'training_service': lambda value: (type(value) is not TrainingServiceConfig, 'cannot be abstract base class') 'training_service': lambda value: (type(value) is not TrainingServiceConfig, 'cannot be abstract base class')
......
...@@ -27,6 +27,9 @@ def to_v2(v1) -> ExperimentConfig: ...@@ -27,6 +27,9 @@ def to_v2(v1) -> ExperimentConfig:
if isinstance(v2.max_experiment_duration, (int, float)): if isinstance(v2.max_experiment_duration, (int, float)):
v2.max_experiment_duration = str(v2.max_experiment_duration) + 's' v2.max_experiment_duration = str(v2.max_experiment_duration) + 's'
_move_field(v1, v2, 'maxTrialNum', 'max_trial_number') _move_field(v1, v2, 'maxTrialNum', 'max_trial_number')
_move_field(v1, v2, 'maxTrialDuration', 'max_trial_duration')
if isinstance(v2.max_trial_duration, (int, float)):
v2.max_trial_duration = str(v2.max_trial_duration) + 's'
_move_field(v1, v2, 'searchSpacePath', 'search_space_file') _move_field(v1, v2, 'searchSpacePath', 'search_space_file')
assert not v1.pop('multiPhase', None), 'Multi-phase is no longer supported' assert not v1.pop('multiPhase', None), 'Multi-phase is no longer supported'
_deprecate(v1, v2, 'multiThread') _deprecate(v1, v2, 'multiThread')
......
...@@ -127,6 +127,7 @@ common_schema = { ...@@ -127,6 +127,7 @@ common_schema = {
Optional('description'): setType('description', str), Optional('description'): setType('description', str),
'trialConcurrency': setNumberRange('trialConcurrency', int, 1, 99999), 'trialConcurrency': setNumberRange('trialConcurrency', int, 1, 99999),
Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')), Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')),
Optional('maxTrialDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxTrialDuration format is [digit]{s,m,h,d}')),
Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999), Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999),
'trainingServicePlatform': setChoice( 'trainingServicePlatform': setChoice(
'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid'), 'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid'),
......
...@@ -250,6 +250,7 @@ def set_experiment_v1(experiment_config, mode, port, config_file_name): ...@@ -250,6 +250,7 @@ def set_experiment_v1(experiment_config, mode, port, config_file_name):
request_data['maxExecDuration'] = experiment_config['maxExecDuration'] request_data['maxExecDuration'] = experiment_config['maxExecDuration']
request_data['maxExperimentDuration'] = str(experiment_config['maxExecDuration']) + 's' request_data['maxExperimentDuration'] = str(experiment_config['maxExecDuration']) + 's'
request_data['maxTrialNum'] = experiment_config['maxTrialNum'] request_data['maxTrialNum'] = experiment_config['maxTrialNum']
request_data['maxTrialDuration'] = experiment_config['maxTrialDuration']
request_data['maxTrialNumber'] = experiment_config['maxTrialNum'] request_data['maxTrialNumber'] = experiment_config['maxTrialNum']
request_data['searchSpace'] = experiment_config.get('searchSpace') request_data['searchSpace'] = experiment_config.get('searchSpace')
request_data['trainingServicePlatform'] = experiment_config.get('trainingServicePlatform') request_data['trainingServicePlatform'] = experiment_config.get('trainingServicePlatform')
......
...@@ -110,6 +110,8 @@ def set_default_values(experiment_config): ...@@ -110,6 +110,8 @@ def set_default_values(experiment_config):
experiment_config['maxExecDuration'] = '999d' experiment_config['maxExecDuration'] = '999d'
if experiment_config.get('maxTrialNum') is None: if experiment_config.get('maxTrialNum') is None:
experiment_config['maxTrialNum'] = 99999 experiment_config['maxTrialNum'] = 99999
if experiment_config.get('maxTrialDuration') is None:
experiment_config['maxTrialDuration'] = '999d'
if experiment_config['trainingServicePlatform'] == 'remote' or \ if experiment_config['trainingServicePlatform'] == 'remote' or \
experiment_config['trainingServicePlatform'] == 'hybrid' and \ experiment_config['trainingServicePlatform'] == 'hybrid' and \
'remote' in experiment_config['hybridConfig']['trainingServicePlatforms']: 'remote' in experiment_config['hybridConfig']['trainingServicePlatforms']:
...@@ -126,3 +128,5 @@ def validate_all_content(experiment_config, config_path): ...@@ -126,3 +128,5 @@ def validate_all_content(experiment_config, config_path):
if 'maxExecDuration' in experiment_config: if 'maxExecDuration' in experiment_config:
experiment_config['maxExecDuration'] = parse_time(experiment_config['maxExecDuration']) experiment_config['maxExecDuration'] = parse_time(experiment_config['maxExecDuration'])
if 'maxTrialDuration' in experiment_config:
experiment_config['maxTrialDuration'] = parse_time(experiment_config['maxTrialDuration'])
...@@ -161,6 +161,7 @@ export interface ExperimentConfig { ...@@ -161,6 +161,7 @@ export interface ExperimentConfig {
trialConcurrency: number; trialConcurrency: number;
trialGpuNumber?: number; trialGpuNumber?: number;
maxExperimentDuration?: string; maxExperimentDuration?: string;
maxTrialDuration?: string;
maxTrialNumber?: number; maxTrialNumber?: number;
nniManagerIp?: string; nniManagerIp?: string;
//useAnnotation: boolean; // dealed inside nnictl //useAnnotation: boolean; // dealed inside nnictl
......
...@@ -189,7 +189,6 @@ class NNIManager implements Manager { ...@@ -189,7 +189,6 @@ class NNIManager implements Manager {
this.log.debug(`dispatcher command: ${dispatcherCommand}`); this.log.debug(`dispatcher command: ${dispatcherCommand}`);
const checkpointDir: string = await this.createCheckpointDir(); const checkpointDir: string = await this.createCheckpointDir();
this.setupTuner(dispatcherCommand, undefined, 'start', checkpointDir); this.setupTuner(dispatcherCommand, undefined, 'start', checkpointDir);
this.setStatus('RUNNING'); this.setStatus('RUNNING');
await this.storeExperimentProfile(); await this.storeExperimentProfile();
this.run().catch((err: Error) => { this.run().catch((err: Error) => {
...@@ -433,6 +432,11 @@ class NNIManager implements Manager { ...@@ -433,6 +432,11 @@ class NNIManager implements Manager {
return (value === undefined ? Infinity : value); return (value === undefined ? Infinity : value);
} }
/**
 * Per-trial duration limit taken from the experiment profile.
 *
 * @returns `Infinity` when the profile's `maxTrialDuration` is undefined;
 *          otherwise the configured duration string converted by `toSeconds`.
 */
private get maxTrialDuration(): number {
    const configured = this.experimentProfile.params.maxTrialDuration;
    if (configured === undefined) {
        // No limit configured: report an unbounded duration.
        return Infinity;
    }
    return toSeconds(configured);
}
private async initTrainingService(config: ExperimentConfig): Promise<TrainingService> { private async initTrainingService(config: ExperimentConfig): Promise<TrainingService> {
let platform: string; let platform: string;
if (Array.isArray(config.trainingService)) { if (Array.isArray(config.trainingService)) {
...@@ -539,6 +543,17 @@ class NNIManager implements Manager { ...@@ -539,6 +543,17 @@ class NNIManager implements Manager {
} }
} }
/**
 * Timer callback that enforces `maxTrialDuration` on one trial job.
 *
 * The trial is cancelled (flagged as early-stopped) only when it is RUNNING
 * and has actually been running for at least `maxTrialDuration`. If the
 * timer fires before that point, the check is re-armed for the remaining
 * time instead of cancelling the trial. Firing early happens when the trial
 * waited in the queue after submission, or when the requested delay exceeded
 * what a Node.js timer can represent, in which case Node fires it after 1 ms.
 *
 * Nothing is done when no finite limit is configured, when the trial is
 * unknown, when it is not RUNNING, or when it has no start time.
 *
 * @param trialJobId - id of the trial job to check.
 * @returns a promise that resolves once the check (and the cancellation, if
 *          one was issued) has completed. Errors from `cancelTrialJob`
 *          propagate to the caller.
 */
private async stopTrialJobIfOverMaxDurationTimer(trialJobId: string): Promise<void> {
    const limitMs: number = 1000 * this.maxTrialDuration;
    if (!Number.isFinite(limitMs)) {
        // No limit configured (the getter yields Infinity): never cancel.
        return;
    }

    const trialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
    if (trialJobDetail === undefined ||
        trialJobDetail.status !== 'RUNNING' ||
        trialJobDetail.startTime === undefined) {
        return;
    }

    // NOTE(review): assumes startTime is an epoch timestamp in milliseconds
    // (i.e. produced by Date.now()) — confirm against the training services.
    const remainingMs: number = limitMs - (Date.now() - trialJobDetail.startTime);
    if (remainingMs > 0) {
        // Node.js timers treat delays above 2^31 - 1 ms as 1 ms, so cap the
        // delay; the re-armed callback re-evaluates the elapsed time anyway.
        const maxTimerDelayMs = 2147483647;
        setTimeout(
            async () => this.stopTrialJobIfOverMaxDurationTimer(trialJobId),
            Math.min(remainingMs, maxTimerDelayMs)
        );
        return;
    }

    const isEarlyStopped = true;
    await this.trainingService.cancelTrialJob(trialJobId, isEarlyStopped);
    this.log.info(`Trial job ${trialJobId} has stoped because it is over maxTrialDuration.`);
}
private async requestTrialJobsStatus(): Promise<number> { private async requestTrialJobsStatus(): Promise<number> {
let finishedTrialJobNum: number = 0; let finishedTrialJobNum: number = 0;
if (this.dispatcher === undefined) { if (this.dispatcher === undefined) {
...@@ -662,6 +677,7 @@ class NNIManager implements Manager { ...@@ -662,6 +677,7 @@ class NNIManager implements Manager {
this.currSubmittedTrialNum++; this.currSubmittedTrialNum++;
this.log.info('submitTrialJob: form:', form); this.log.info('submitTrialJob: form:', form);
const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(form); const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(form);
setTimeout(async ()=> this.stopTrialJobIfOverMaxDurationTimer(trialJobDetail.id), 1000 * this.maxTrialDuration);
const Snapshot: TrialJobDetail = Object.assign({}, trialJobDetail); const Snapshot: TrialJobDetail = Object.assign({}, trialJobDetail);
await this.storeExperimentProfile(); await this.storeExperimentProfile();
this.trialJobs.set(trialJobDetail.id, Snapshot); this.trialJobs.set(trialJobDetail.id, Snapshot);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment