Commit 0a20c3fc (unverified), authored by liuzhe-lz, committed by GitHub
Browse files

Fix RemoteConfig bug and add save log to dispatcher.log for nni.Experiment (#3245)

parent 8a08fab6
......@@ -51,6 +51,10 @@ class RemoteConfig(TrainingServiceConfig):
kwargs['machinelist'] = util.load_config(RemoteMachineConfig, kwargs.get('machinelist'))
super().__init__(**kwargs)
_canonical_rules = {
'machine_list': lambda value: [config.canonical() for config in value]
}
_validation_rules = {
'platform': lambda value: (value == 'remote', 'cannot be modified')
}
......@@ -139,7 +139,7 @@ class Experiment:
"""
Stop background experiment.
"""
_logger.info('Stopping experiment...')
_logger.info('Stopping experiment, please wait...')
atexit.unregister(self.stop)
if self._proc is not None:
......@@ -155,6 +155,7 @@ class Experiment:
self._pipe = None
self._dispatcher = None
self._dispatcher_thread = None
_logger.info('Experiment stopped')
def run(self, port: int = 8080, debug: bool = False) -> bool:
......@@ -174,6 +175,8 @@ class Experiment:
return True
if status == 'ERROR':
return False
except KeyboardInterrupt:
_logger.warning('KeyboardInterrupt detected')
finally:
self.stop()
......
......@@ -46,6 +46,9 @@ def init_logger_experiment() -> None:
"""
formatter.format = _colorful_format
log_path = _prepare_log_dir(dispatcher_env_vars.NNI_LOG_DIRECTORY) / 'dispatcher.log'
_setup_root_logger(FileHandler(log_path), logging.DEBUG)
time_format = '%Y-%m-%d %H:%M:%S'
......
......@@ -450,15 +450,17 @@ class NNIManager implements Manager {
throw new Error('Error: tuner has not been setup');
}
this.trainingService.removeTrialJobMetricListener(this.trialJobMetricListener);
this.dispatcher.sendCommand(TERMINATE);
let tunerAlive: boolean = true;
// gracefully terminate tuner and assessor here, wait at most 30 seconds.
for (let i: number = 0; i < 30; i++) {
if (!tunerAlive) { break; }
tunerAlive = await isAlive(this.dispatcherPid);
await delay(1000);
}
await killPid(this.dispatcherPid);
if (this.dispatcherPid > 0) {
this.dispatcher.sendCommand(TERMINATE);
let tunerAlive: boolean = true;
// gracefully terminate tuner and assessor here, wait at most 30 seconds.
for (let i: number = 0; i < 30; i++) {
if (!tunerAlive) { break; }
tunerAlive = await isAlive(this.dispatcherPid);
await delay(1000);
}
await killPid(this.dispatcherPid);
}
const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs();
// DON'T try to make it in parallel, the training service may not handle it well.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment