Unverified commit 8c93890b, authored by QuanluZhang, committed by GitHub
Browse files

fix bug about execDuration and endTime (#270)

* fix bug about execDuration and endTime

* modify time interval to 30 seconds

* refactor based on Gems's suggestion

* for triggering ci
parent 60ad9400
...@@ -58,7 +58,6 @@ class NNIManager implements Manager { ...@@ -58,7 +58,6 @@ class NNIManager implements Manager {
private status: NNIManagerStatus; private status: NNIManagerStatus;
private waitingTrials: string[]; private waitingTrials: string[];
private trialJobs: Map<string, TrialJobDetail>; private trialJobs: Map<string, TrialJobDetail>;
private suspendDuration: number;
constructor() { constructor() {
this.currSubmittedTrialNum = 0; this.currSubmittedTrialNum = 0;
...@@ -69,7 +68,6 @@ class NNIManager implements Manager { ...@@ -69,7 +68,6 @@ class NNIManager implements Manager {
this.dispatcherPid = 0; this.dispatcherPid = 0;
this.waitingTrials = []; this.waitingTrials = [];
this.trialJobs = new Map<string, TrialJobDetail>(); this.trialJobs = new Map<string, TrialJobDetail>();
this.suspendDuration = 0;
this.log = getLogger(); this.log = getLogger();
this.dataStore = component.get(DataStore); this.dataStore = component.get(DataStore);
...@@ -336,14 +334,18 @@ class NNIManager implements Manager { ...@@ -336,14 +334,18 @@ class NNIManager implements Manager {
} }
private async periodicallyUpdateExecDuration(): Promise<void> { private async periodicallyUpdateExecDuration(): Promise<void> {
const startTime: number = Date.now(); let count: number = 1;
const execDuration: number = this.experimentProfile.execDuration;
for (; ;) { for (; ;) {
await delay(1000 * 60 * 10); // 10 minutes await delay(1000 * 1); // 1 seconds
this.experimentProfile.execDuration = execDuration + (Date.now() - startTime) / 1000 - this.suspendDuration; if (this.status.status === 'EXPERIMENT_RUNNING') {
this.experimentProfile.execDuration += 1;
if (count % 10 === 0) {
await this.storeExperimentProfile(); await this.storeExperimentProfile();
} }
} }
count += 1;
}
}
private async requestTrialJobsStatus(): Promise<number> { private async requestTrialJobsStatus(): Promise<number> {
const deferred: Deferred<number> = new Deferred<number>(); const deferred: Deferred<number> = new Deferred<number>();
...@@ -351,7 +353,6 @@ class NNIManager implements Manager { ...@@ -351,7 +353,6 @@ class NNIManager implements Manager {
for (const trialJobId of Array.from(this.trialJobs.keys())) { for (const trialJobId of Array.from(this.trialJobs.keys())) {
const trialJobDetail: TrialJobDetail = await this.trainingService.getTrialJob(trialJobId); const trialJobDetail: TrialJobDetail = await this.trainingService.getTrialJob(trialJobId);
const oldTrialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId); const oldTrialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
//assert(oldTrialJobDetail);
if (oldTrialJobDetail !== undefined && oldTrialJobDetail.status !== trialJobDetail.status) { if (oldTrialJobDetail !== undefined && oldTrialJobDetail.status !== trialJobDetail.status) {
this.trialJobs.set(trialJobId, Object.assign({}, trialJobDetail)); this.trialJobs.set(trialJobId, Object.assign({}, trialJobDetail));
await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, undefined, trialJobDetail.url); await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, undefined, trialJobDetail.url);
...@@ -388,8 +389,6 @@ class NNIManager implements Manager { ...@@ -388,8 +389,6 @@ class NNIManager implements Manager {
throw new Error('Error: tuner has not been setup'); throw new Error('Error: tuner has not been setup');
} }
let allFinishedTrialJobNum: number = 0; let allFinishedTrialJobNum: number = 0;
const startTime: number = Date.now();
let suspendStartTime: number = 0;
for (; ;) { for (; ;) {
if (this.status.status === 'STOPPING') { if (this.status.status === 'STOPPING') {
break; break;
...@@ -426,18 +425,18 @@ class NNIManager implements Manager { ...@@ -426,18 +425,18 @@ class NNIManager implements Manager {
} }
// check maxtrialnum and maxduration here // check maxtrialnum and maxduration here
if ((Date.now() - startTime) / 1000 + this.experimentProfile.execDuration - this.suspendDuration if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
> this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) { this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
assert(this.status.status === 'EXPERIMENT_RUNNING' || this.status.status === 'DONE'); assert(this.status.status === 'EXPERIMENT_RUNNING' || this.status.status === 'DONE');
if (this.status.status === 'EXPERIMENT_RUNNING') { if (this.status.status === 'EXPERIMENT_RUNNING') {
suspendStartTime = Date.now(); this.experimentProfile.endTime = Date.now();
await this.storeExperimentProfile();
} }
this.status.status = 'DONE'; this.status.status = 'DONE';
} else { } else {
if (this.status.status === 'DONE') { if (this.status.status === 'DONE') {
assert(suspendStartTime !== 0); delete this.experimentProfile.endTime;
this.suspendDuration += (Date.now() - suspendStartTime) / 1000; await this.storeExperimentProfile();
} }
this.status.status = 'EXPERIMENT_RUNNING'; this.status.status = 'EXPERIMENT_RUNNING';
for (let i: number = this.trialJobs.size; i < this.experimentProfile.params.trialConcurrency; i++) { for (let i: number = this.trialJobs.size; i < this.experimentProfile.params.trialConcurrency; i++) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment