Unverified Commit f56f688b authored by chicm-ms's avatar chicm-ms Committed by GitHub
Browse files

Fix sequence id issue on resuming experiment (#316)

parent 06710abd
......@@ -27,6 +27,7 @@ class ExperimentStartupInfo {
private experimentId: string = '';
private newExperiment: boolean = true;
private initialized: boolean = false;
private initTrialSequenceID: number = 0;
public setStartupInfo(newExperiment: boolean, experimentId: string): void {
assert(!this.initialized);
......@@ -48,6 +49,17 @@ class ExperimentStartupInfo {
return this.newExperiment;
}
/**
 * Stores the initial trial sequence id on this startup-info object.
 * Asserts that the object has already been initialized; the assertion runs
 * before the value is written, so a failed assertion leaves the field untouched.
 *
 * @param initSequenceId value subsequently returned by getInitTrialSequenceId()
 */
public setInitTrialSequenceId(initSequenceId: number): void {
assert(this.initialized);
this.initTrialSequenceID = initSequenceId;
}
/**
 * Returns the stored initial trial sequence id.
 * Asserts that the object has already been initialized. If
 * setInitTrialSequenceId() was never called, the field still holds the
 * value it was declared with.
 *
 * @returns the current value of initTrialSequenceID
 */
public getInitTrialSequenceId(): number {
assert(this.initialized);
return this.initTrialSequenceID;
}
}
function getExperimentId(): string {
......@@ -58,8 +70,17 @@ function isNewExperiment(): boolean {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).isNewExperiment();
}
/**
 * Module-level shortcut: looks up the ExperimentStartupInfo instance through
 * the component container and forwards the initial trial sequence id to it.
 *
 * @param initSequenceId sequence id to store on the startup info
 */
function setInitTrialSequenceId(initSequenceId: number): void {
    const startupInfo: ExperimentStartupInfo = component.get<ExperimentStartupInfo>(ExperimentStartupInfo);
    startupInfo.setInitTrialSequenceId(initSequenceId);
}
/**
 * Module-level shortcut: looks up the ExperimentStartupInfo instance through
 * the component container and reads the initial trial sequence id from it.
 *
 * @returns the value reported by ExperimentStartupInfo.getInitTrialSequenceId()
 */
function getInitTrialSequenceId(): number {
    const startupInfo: ExperimentStartupInfo = component.get<ExperimentStartupInfo>(ExperimentStartupInfo);

    return startupInfo.getInitTrialSequenceId();
}
/**
 * Module-level shortcut: looks up the ExperimentStartupInfo instance through
 * the component container and passes both startup values to setStartupInfo().
 *
 * @param newExperiment forwarded unchanged as the first argument
 * @param experimentId forwarded unchanged as the second argument
 */
function setExperimentStartupInfo(newExperiment: boolean, experimentId: string): void {
    const startupInfo: ExperimentStartupInfo = component.get<ExperimentStartupInfo>(ExperimentStartupInfo);
    startupInfo.setStartupInfo(newExperiment, experimentId);
}
export { ExperimentStartupInfo, getExperimentId, isNewExperiment, setExperimentStartupInfo };
export { ExperimentStartupInfo, getExperimentId, isNewExperiment,
setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId };
......@@ -65,6 +65,7 @@ interface ExperimentProfile {
logDir?: string;
startTime?: number;
endTime?: number;
maxSequenceId: number;
revision: number;
}
......
......@@ -26,7 +26,7 @@ import { Deferred } from 'ts-deferred';
import * as component from '../common/component';
import { DataStore, MetricDataRecord, MetricType, TrialJobInfo } from '../common/datastore';
import { NNIError } from '../common/errors';
import { getExperimentId } from '../common/experimentStartupInfo';
import { getExperimentId, setInitTrialSequenceId } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log';
import {
ExperimentParams, ExperimentProfile, Manager,
......@@ -152,6 +152,8 @@ class NNIManager implements Manager {
this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId);
const expParams: ExperimentParams = this.experimentProfile.params;
setInitTrialSequenceId(this.experimentProfile.maxSequenceId + 1);
// Set up multiphase config
if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
......@@ -462,6 +464,7 @@ class NNIManager implements Manager {
}
};
const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm);
await this.storeMaxSequenceId(trialJobDetail.sequenceId);
this.trialJobs.set(trialJobDetail.id, Object.assign({}, trialJobDetail));
const trialJobDetailSnapshot: TrialJobDetail | undefined = this.trialJobs.get(trialJobDetail.id);
if (trialJobDetailSnapshot != undefined) {
......@@ -593,6 +596,7 @@ class NNIManager implements Manager {
revision: 0,
execDuration: 0,
logDir: getLogDir(),
maxSequenceId: 0,
params: {
authorName: '',
experimentName: '',
......@@ -609,6 +613,13 @@ class NNIManager implements Manager {
}
};
}
/**
 * Records a new high-water mark for trial sequence ids.
 * When `sequenceId` is greater than the profile's current maxSequenceId, the
 * profile field is updated and storeExperimentProfile() is awaited; otherwise
 * nothing is written.
 *
 * @param sequenceId sequence id to compare against the recorded maximum
 */
private async storeMaxSequenceId(sequenceId: number): Promise<void> {
    // Deliberately the negation of `>` rather than `<=`, so that a non-numeric
    // stored value compares exactly as it would in a plain `if (a > b)` test.
    if (!(sequenceId > this.experimentProfile.maxSequenceId)) {
        return;
    }

    this.experimentProfile.maxSequenceId = sequenceId;
    await this.storeExperimentProfile();
}
}
export { NNIManager };
......@@ -53,6 +53,7 @@ create table ExperimentProfile (
startTime integer,
endTime integer,
logDir text,
maxSequenceId integer,
revision integer);
create index ExperimentProfile_id on ExperimentProfile(id);
`;
......@@ -65,6 +66,7 @@ function loadExperimentProfile(row: any): ExperimentProfile {
startTime: row.startTime === null ? undefined : row.startTime,
endTime: row.endTime === null ? undefined : row.endTime,
logDir: row.logDir === null ? undefined : row.logDir,
maxSequenceId: row.maxSequenceId,
revision: row.revision
};
}
......@@ -131,7 +133,7 @@ class SqlDB implements Database {
}
public storeExperimentProfile(exp: ExperimentProfile): Promise<void> {
const sql: string = 'insert into ExperimentProfile values (?,?,?,?,?,?,?)';
const sql: string = 'insert into ExperimentProfile values (?,?,?,?,?,?,?,?)';
const args: any[] = [
JSON.stringify(exp.params),
exp.id,
......@@ -139,6 +141,7 @@ class SqlDB implements Database {
exp.startTime === undefined ? null : exp.startTime,
exp.endTime === undefined ? null : exp.endTime,
exp.logDir === undefined ? null : exp.logDir,
exp.maxSequenceId,
exp.revision
];
......
......@@ -79,6 +79,7 @@ describe('Unit test for dataStore', () => {
execDuration: 0,
startTime: Date.now(),
endTime: Date.now(),
maxSequenceId: 0,
revision: 0
}
const id: string = profile.id;
......
......@@ -64,10 +64,10 @@ const expParams2: ExperimentParams = {
};
const profiles: ExperimentProfile[] = [
{ params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: undefined, revision: 1 },
{ params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 3 }
{ params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: undefined, revision: 1, maxSequenceId: 0 },
{ params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2, maxSequenceId: 0 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2, maxSequenceId: 0 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 3, maxSequenceId: 0 }
];
const events: TrialJobEventRecord[] = [
......
......@@ -147,6 +147,7 @@ export class MockedNNIManager extends Manager {
execDuration: 0,
startTime: Date.now(),
endTime: Date.now(),
maxSequenceId: 0,
revision: 0
};
......
......@@ -30,12 +30,12 @@ import { MethodNotImplementedError, NNIError, NNIErrorNames } from '../../common
import { getLogger, Logger } from '../../common/log';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { getInitTrialSequenceId } from '../../common/experimentStartupInfo';
import {
HostJobApplicationForm, JobApplicationForm, HyperParameters, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../common/utils';
import { file } from 'tmp';
const tkill = require('tree-kill');
......@@ -111,7 +111,7 @@ class LocalTrainingService implements TrainingService {
this.initialized = false;
this.stopping = false;
this.log = getLogger();
this.trialSequenceId = 0;
this.trialSequenceId = -1;
}
public async run(): Promise<void> {
......@@ -432,6 +432,10 @@ class LocalTrainingService implements TrainingService {
}
/**
 * Hands out the next trial sequence id and advances the internal counter.
 * While the counter still holds -1 it is first seeded from
 * getInitTrialSequenceId(), so the lookup happens on the first call rather
 * than at construction time.
 *
 * @returns the sequence id for the trial being created
 */
private generateSequenceId(): number {
    if (this.trialSequenceId === -1) {
        this.trialSequenceId = getInitTrialSequenceId();
    }

    const allocatedId: number = this.trialSequenceId;
    this.trialSequenceId = allocatedId + 1;

    return allocatedId;
}
......
......@@ -29,7 +29,7 @@ import * as request from 'request';
import { Deferred } from 'ts-deferred';
import { EventEmitter } from 'events';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo';
import { HDFSClientUtility } from './hdfsClientUtility'
import { MethodNotImplementedError } from '../../common/errors';
import { getLogger, Logger } from '../../common/log';
......@@ -78,7 +78,7 @@ class PAITrainingService implements TrainingService {
this.experimentId = getExperimentId();
this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap);
this.hdfsDirPattern = 'hdfs://(?<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?<baseDir>/.*)?';
this.trialSequenceId = 0;
this.trialSequenceId = -1;
}
public async run(): Promise<void> {
......@@ -454,6 +454,10 @@ class PAITrainingService implements TrainingService {
}
/**
 * Hands out the next trial sequence id and advances the internal counter.
 * While the counter still holds -1 it is first seeded from
 * getInitTrialSequenceId(), so the lookup happens on the first call rather
 * than at construction time.
 *
 * @returns the sequence id for the trial being created
 */
private generateSequenceId(): number {
    if (this.trialSequenceId === -1) {
        this.trialSequenceId = getInitTrialSequenceId();
    }

    const allocatedId: number = this.trialSequenceId;
    this.trialSequenceId = allocatedId + 1;

    return allocatedId;
}
}
......
......@@ -30,7 +30,7 @@ import { Deferred } from 'ts-deferred';
import { String } from 'typescript-string-operations';
import * as component from '../../common/component';
import { MethodNotImplementedError, NNIError, NNIErrorNames } from '../../common/errors';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { ObservableTimer } from '../../common/observableTimer';
import {
......@@ -77,7 +77,7 @@ class RemoteMachineTrainingService implements TrainingService {
this.remoteExpRootDir = this.getRemoteExperimentRootDir();
this.timer = timer;
this.log = getLogger();
this.trialSequenceId = 0;
this.trialSequenceId = -1;
}
/**
......@@ -607,6 +607,10 @@ class RemoteMachineTrainingService implements TrainingService {
}
/**
 * Hands out the next trial sequence id and advances the internal counter.
 * While the counter still holds -1 it is first seeded from
 * getInitTrialSequenceId(), so the lookup happens on the first call rather
 * than at construction time.
 *
 * @returns the sequence id for the trial being created
 */
private generateSequenceId(): number {
    if (this.trialSequenceId === -1) {
        this.trialSequenceId = getInitTrialSequenceId();
    }

    const allocatedId: number = this.trialSequenceId;
    this.trialSequenceId = allocatedId + 1;

    return allocatedId;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment