"...composable_kernel_onnxruntime.git" did not exist on "d7c84daf66d468f8781a0fadfaf2f9fe0f74c49c"
Unverified Commit 1fec96c9 authored by QuanluZhang's avatar QuanluZhang Committed by GitHub
Browse files

fix state transition (#504)

parent d5f808b0
...@@ -85,7 +85,7 @@ interface TrialJobStatistics { ...@@ -85,7 +85,7 @@ interface TrialJobStatistics {
} }
interface NNIManagerStatus { interface NNIManagerStatus {
status: 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL'; status: 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL';
errors: string[]; errors: string[];
} }
......
...@@ -425,14 +425,10 @@ class NNIManager implements Manager { ...@@ -425,14 +425,10 @@ class NNIManager implements Manager {
throw new Error('Error: tuner has not been setup'); throw new Error('Error: tuner has not been setup');
} }
let allFinishedTrialJobNum: number = 0; let allFinishedTrialJobNum: number = 0;
let waitSubmittedToFinish: number;
while (this.status.status !== 'STOPPING' && this.status.status !== 'STOPPED') { while (this.status.status !== 'STOPPING' && this.status.status !== 'STOPPED') {
const finishedTrialJobNum: number = await this.requestTrialJobsStatus(); const finishedTrialJobNum: number = await this.requestTrialJobsStatus();
allFinishedTrialJobNum += finishedTrialJobNum; allFinishedTrialJobNum += finishedTrialJobNum;
if (allFinishedTrialJobNum >= this.experimentProfile.params.maxTrialNum) {
// write this log for travis CI
this.log.info('Experiment done.');
}
// requestTrialNum is the number of trials that will be requested from tuner. // requestTrialNum is the number of trials that will be requested from tuner.
// If trialConcurrency does not change, requestTrialNum equals finishedTrialJobNum. // If trialConcurrency does not change, requestTrialNum equals finishedTrialJobNum.
...@@ -467,21 +463,29 @@ class NNIManager implements Manager { ...@@ -467,21 +463,29 @@ class NNIManager implements Manager {
// as still running. DONE could be transferred from RUNNING or NO_MORE_TRIAL. // as still running. DONE could be transferred from RUNNING or NO_MORE_TRIAL.
assert(this.status.status === 'RUNNING' || assert(this.status.status === 'RUNNING' ||
this.status.status === 'DONE' || this.status.status === 'DONE' ||
this.status.status === 'NO_MORE_TRIAL'); this.status.status === 'NO_MORE_TRIAL' ||
this.status.status === 'TUNER_NO_MORE_TRIAL');
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration || if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) { this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
if (this.status.status === 'RUNNING' || if (this.status.status !== 'DONE') {
this.status.status === 'NO_MORE_TRIAL') { this.status.status = 'NO_MORE_TRIAL';
waitSubmittedToFinish = this.currSubmittedTrialNum;
assert(allFinishedTrialJobNum <= waitSubmittedToFinish);
if (allFinishedTrialJobNum >= waitSubmittedToFinish) {
this.status.status = 'DONE';
this.experimentProfile.endTime = Date.now(); this.experimentProfile.endTime = Date.now();
await this.storeExperimentProfile(); await this.storeExperimentProfile();
// write this log for travis CI
this.log.info('Experiment done.');
}
} }
this.status.status = 'DONE';
} else { } else {
if (this.status.status === 'DONE') { if (this.status.status === 'DONE') {
delete this.experimentProfile.endTime; delete this.experimentProfile.endTime;
await this.storeExperimentProfile(); await this.storeExperimentProfile();
} }
if (this.status.status !== 'NO_MORE_TRIAL') { if (this.status.status !== 'TUNER_NO_MORE_TRIAL') {
this.status.status = 'RUNNING'; this.status.status = 'RUNNING';
} }
for (let i: number = this.trialJobs.size; i < this.experimentProfile.params.trialConcurrency; i++) { for (let i: number = this.trialJobs.size; i < this.experimentProfile.params.trialConcurrency; i++) {
...@@ -602,7 +606,7 @@ class NNIManager implements Manager { ...@@ -602,7 +606,7 @@ class NNIManager implements Manager {
this.requestTrialJobs(this.experimentProfile.params.trialConcurrency); this.requestTrialJobs(this.experimentProfile.params.trialConcurrency);
break; break;
case NEW_TRIAL_JOB: case NEW_TRIAL_JOB:
if (this.status.status === 'NO_MORE_TRIAL') { if (this.status.status === 'TUNER_NO_MORE_TRIAL') {
this.log.warning('It is not supposed to receive more trials after NO_MORE_TRIAL is set'); this.log.warning('It is not supposed to receive more trials after NO_MORE_TRIAL is set');
this.status.status = 'RUNNING'; this.status.status = 'RUNNING';
} }
...@@ -625,7 +629,7 @@ class NNIManager implements Manager { ...@@ -625,7 +629,7 @@ class NNIManager implements Manager {
'ADD_HYPERPARAMETER', tunerCommand.trial_job_id, content, undefined); 'ADD_HYPERPARAMETER', tunerCommand.trial_job_id, content, undefined);
break; break;
case NO_MORE_TRIAL_JOBS: case NO_MORE_TRIAL_JOBS:
this.status.status = 'NO_MORE_TRIAL'; this.status.status = 'TUNER_NO_MORE_TRIAL';
break; break;
case KILL_TRIAL_JOB: case KILL_TRIAL_JOB:
await this.trainingService.cancelTrialJob(JSON.parse(content), true); await this.trainingService.cancelTrialJob(JSON.parse(content), true);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment